summaryrefslogtreecommitdiff
path: root/libs/ffvpx/libavcodec/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ffvpx/libavcodec/x86')
-rw-r--r--libs/ffvpx/libavcodec/x86/constants.c94
-rw-r--r--libs/ffvpx/libavcodec/x86/constants.h72
-rw-r--r--libs/ffvpx/libavcodec/x86/flacdsp.asm313
-rw-r--r--libs/ffvpx/libavcodec/x86/flacdsp_init.c115
-rw-r--r--libs/ffvpx/libavcodec/x86/h264_intrapred.asm2757
-rw-r--r--libs/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm1199
-rw-r--r--libs/ffvpx/libavcodec/x86/h264_intrapred_init.c410
-rw-r--r--libs/ffvpx/libavcodec/x86/mathops.h133
-rw-r--r--libs/ffvpx/libavcodec/x86/moz.build34
-rw-r--r--libs/ffvpx/libavcodec/x86/videodsp.asm468
-rw-r--r--libs/ffvpx/libavcodec/x86/videodsp_init.c309
-rw-r--r--libs/ffvpx/libavcodec/x86/vp56_arith.h51
-rw-r--r--libs/ffvpx/libavcodec/x86/vp8dsp.asm1231
-rw-r--r--libs/ffvpx/libavcodec/x86/vp8dsp_init.c467
-rw-r--r--libs/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm1584
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init.c416
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init.h189
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c25
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c25
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c149
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c240
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9intrapred.asm2044
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm2392
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9itxfm.asm3197
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm2044
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9itxfm_template.asm142
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9lpf.asm1211
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm823
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9mc.asm675
-rw-r--r--libs/ffvpx/libavcodec/x86/vp9mc_16bpp.asm431
30 files changed, 23240 insertions, 0 deletions
diff --git a/libs/ffvpx/libavcodec/x86/constants.c b/libs/ffvpx/libavcodec/x86/constants.c
new file mode 100644
index 000000000..4bfb78cc3
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/constants.c
@@ -0,0 +1,94 @@
+/*
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h" // for xmm_reg
+#include "constants.h"
+
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+ 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
+DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+ 0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+ 0x0100010001000100ULL, 0x0100010001000100ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+ 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+ 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+ 0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+ 0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+ 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+ 0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+ 0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+ 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
+
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+ 0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+ 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
+ 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+ 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+ 0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+ 0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+ 0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+ 0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libs/ffvpx/libavcodec/x86/constants.h b/libs/ffvpx/libavcodec/x86/constants.h
new file mode 100644
index 000000000..85da38b7b
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/constants.h
@@ -0,0 +1,72 @@
+/*
+ * MMX/SSE constants used across x86 dsp optimizations.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_CONSTANTS_H
+#define AVCODEC_X86_CONSTANTS_H
+
+#include <stdint.h>
+
+#include "libavutil/x86/asm.h"
+
+extern const ymm_reg ff_pw_1;
+extern const ymm_reg ff_pw_2;
+extern const xmm_reg ff_pw_3;
+extern const ymm_reg ff_pw_4;
+extern const xmm_reg ff_pw_5;
+extern const xmm_reg ff_pw_8;
+extern const xmm_reg ff_pw_9;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg ff_pw_16;
+extern const xmm_reg ff_pw_18;
+extern const xmm_reg ff_pw_20;
+extern const xmm_reg ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const uint64_t ff_pw_53;
+extern const xmm_reg ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const ymm_reg ff_pw_255;
+extern const ymm_reg ff_pw_256;
+extern const ymm_reg ff_pw_512;
+extern const ymm_reg ff_pw_1023;
+extern const ymm_reg ff_pw_1024;
+extern const ymm_reg ff_pw_2048;
+extern const ymm_reg ff_pw_4095;
+extern const ymm_reg ff_pw_4096;
+extern const ymm_reg ff_pw_8192;
+extern const ymm_reg ff_pw_m1;
+
+extern const ymm_reg ff_pb_0;
+extern const ymm_reg ff_pb_1;
+extern const ymm_reg ff_pb_2;
+extern const ymm_reg ff_pb_3;
+extern const ymm_reg ff_pb_80;
+extern const ymm_reg ff_pb_FE;
+extern const uint64_t ff_pb_FC;
+
+extern const xmm_reg ff_ps_neg;
+
+extern const ymm_reg ff_pd_1;
+extern const ymm_reg ff_pd_16;
+extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_8192;
+extern const ymm_reg ff_pd_65535;
+
+#endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libs/ffvpx/libavcodec/x86/flacdsp.asm b/libs/ffvpx/libavcodec/x86/flacdsp.asm
new file mode 100644
index 000000000..713861152
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PMACSDQL 5
+%if cpuflag(xop)
+ pmacsdql %1, %2, %3, %1
+%else
+ pmuldq %2, %3
+ paddq %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+ sub lend, pred_orderd
+ jle .ret
+ lea decodedq, [decodedq+pred_orderq*4-8]
+ lea coeffsq, [coeffsq+pred_orderq*4]
+ neg pred_orderq
+ movd m4, qlevelm
+ALIGN 16
+.loop_sample:
+ movd m0, [decodedq+pred_orderq*4+8]
+ add decodedq, 8
+ movd m1, [coeffsq+pred_orderq*4]
+ pxor m2, m2
+ pxor m3, m3
+ lea jq, [pred_orderq+1]
+ test jq, jq
+ jz .end_order
+.loop_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ movd m0, [decodedq+jq*4]
+ PMACSDQL m3, m1, m0, m3, m1
+ movd m1, [coeffsq+jq*4]
+ inc jq
+ jl .loop_order
+.end_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ psrlq m2, m4
+ movd m0, [decodedq]
+ paddd m0, m2
+ movd [decodedq], m0
+ sub lend, 2
+ jl .ret
+ PMACSDQL m3, m1, m0, m3, m1
+ psrlq m3, m4
+ movd m1, [decodedq+4]
+ paddd m1, m3
+ movd [decodedq+4], m1
+ jg .loop_sample
+.ret:
+ REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ shl lend, 2
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ add in1q, lenq
+ add in0q, lenq
+ add outq, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [in0q + lenq]
+ mova m1, [in1q + lenq]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+%ifnidn %1, indep2
+ p%4d m2, m0, m1
+%endif
+ packssdw m%2, m%2
+ packssdw m%3, m%3
+ punpcklwd m%2, m%3
+ psllw m%2, m3
+ mova [outq + lenq], m%2
+ add lenq, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+ mov lend, lenm
+%endif
+ movd m3, r4m
+ mov in1q, [in0q + gprsize]
+ mov in0q, [in0q]
+ mov outq, [outq]
+ sub in1q, in0q
+
+align 16
+.loop:
+ mova m0, [in0q]
+ mova m1, [in0q + in1q]
+%ifidn %1, ms
+ psrad m2, m1, 1
+ psubd m0, m2
+%endif
+ p%5d m2, m0, m1
+ pslld m%2, m3
+ pslld m%3, m3
+
+ SBUTTERFLY dq, %2, %3, %4
+
+ mova [outq ], m%2
+ mova [outq + mmsize], m%3
+
+ add in0q, mmsize
+ add outq, mmsize*2
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+; int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+ DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+ %define lend dword r3m
+%else
+ mov lend, lenm
+%endif
+%endif
+ movd m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+ mov in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+ mov in0q, [in0q]
+ mov outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+ sub in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+ mova m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+ mova m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+ TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+ SBUTTERFLY dq, 0, 1, 6
+ SBUTTERFLY dq, 2, 3, 6
+ SBUTTERFLY dq, 4, 5, 6
+
+ punpcklqdq m6, m0, m2
+ punpckhqdq m2, m4
+ shufps m4, m0, 0xe4
+ punpcklqdq m0, m1, m3
+ punpckhqdq m3, m5
+ shufps m5, m1, 0xe4
+ SWAP 0,6,1,4,5,3
+%elif %2 == 4
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+ packssdw m0, [in0q + in4q]
+ packssdw m1, [in0q + in5q]
+ packssdw m2, [in0q + in6q]
+ packssdw m3, [in0q + in7q]
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+ packssdw m0, [in0q + in3q]
+ packssdw m1, [in0q + in4q]
+ packssdw m2, [in0q + in5q]
+ pshufd m3, m0, q1032
+ punpcklwd m0, m1
+ punpckhwd m1, m2
+ punpcklwd m2, m3
+
+ shufps m3, m0, m2, q2020
+ shufps m0, m1, q2031
+ shufps m2, m1, q3131
+ shufps m1, m2, m3, q3120
+ shufps m3, m0, q0220
+ shufps m0, m2, q3113
+ SWAP 2, 0, 3
+%else ; %2 == 4
+ packssdw m0, [in0q + in2q]
+ packssdw m1, [in0q + in3q]
+ SBUTTERFLY wd, 0, 1, 2
+ SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+ psll%4 m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+ mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+ add in0q, mmsize
+ add outq, mmsize*REPCOUNT
+ sub lend, mmsize/4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libs/ffvpx/libavcodec/x86/flacdsp_init.c b/libs/ffvpx/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 000000000..1971f81b8
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt) \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift); \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+ int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16, avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32, avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+ int bps)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 2)
+ c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+ else if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+ c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+ c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+ c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+ }
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_sse4;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ if (fmt == AV_SAMPLE_FMT_S16) {
+ if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+ } else if (fmt == AV_SAMPLE_FMT_S32) {
+ if (channels == 4)
+ c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+ else if (channels == 6)
+ c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+ else if (ARCH_X86_64 && channels == 8)
+ c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+ }
+ }
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->lpc32 = ff_flac_lpc_32_xop;
+ }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ if (CONFIG_GPL)
+ c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+ }
+#endif
+#endif /* HAVE_X86ASM */
+}
diff --git a/libs/ffvpx/libavcodec/x86/h264_intrapred.asm b/libs/ffvpx/libavcodec/x86/h264_intrapred.asm
new file mode 100644
index 000000000..f3aa3172f
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/h264_intrapred.asm
@@ -0,0 +1,2757 @@
+;******************************************************************************
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+tm_shuf: times 8 db 0x03, 0x80
+pw_ff00: times 8 dw 0xff00
+plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
+ db 1, 2, 3, 4, 5, 6, 7, 8
+plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
+ db 1, 2, 3, 4, 0, 0, 0, 0
+pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
+pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
+pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
+pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_4
+cextern pw_5
+cextern pw_8
+cextern pw_16
+cextern pw_17
+cextern pw_32
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal pred16x16_vertical_8, 2,3
+ sub r0, r1
+ mov r2, 8
+ movq mm0, [r0+0]
+ movq mm1, [r0+8]
+.loop:
+ movq [r0+r1*1+0], mm0
+ movq [r0+r1*1+8], mm1
+ movq [r0+r1*2+0], mm0
+ movq [r0+r1*2+8], mm1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+
+INIT_XMM sse
+cglobal pred16x16_vertical_8, 2,3
+ sub r0, r1
+ mov r2, 4
+ movaps xmm0, [r0]
+.loop:
+ movaps [r0+r1*1], xmm0
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ movaps [r0+r1*1], xmm0
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_H 0
+cglobal pred16x16_horizontal_8, 2,3
+ mov r2, 8
+%if cpuflag(ssse3)
+ mova m2, [pb_3]
+%endif
+.loop:
+ movd m0, [r0+r1*0-4]
+ movd m1, [r0+r1*1-4]
+
+%if cpuflag(ssse3)
+ pshufb m0, m2
+ pshufb m1, m2
+%else
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ SPLATW m0, m0, 3
+ SPLATW m1, m1, 3
+ mova [r0+r1*0+8], m0
+ mova [r0+r1*1+8], m1
+%endif
+
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED16x16_H
+INIT_MMX mmxext
+PRED16x16_H
+INIT_XMM ssse3
+PRED16x16_H
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_DC 0
+cglobal pred16x16_dc_8, 2,7
+ mov r4, r0
+ sub r0, r1
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r0+0]
+ psadbw mm1, [r0+8]
+ dec r0
+ movzx r5d, byte [r0+r1*1]
+ paddw mm0, mm1
+ movd r6d, mm0
+ lea r0, [r0+r1*2]
+%rep 7
+ movzx r2d, byte [r0+r1*0]
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0]
+ add r5d, r6d
+ lea r2d, [r2+r5+16]
+ shr r2d, 5
+%if cpuflag(ssse3)
+ pxor m1, m1
+%endif
+ SPLATB_REG m0, r2, m1
+
+%if mmsize==8
+ mov r3d, 8
+.loop:
+ mova [r4+r1*0+0], m0
+ mova [r4+r1*0+8], m0
+ mova [r4+r1*1+0], m0
+ mova [r4+r1*1+8], m0
+%else
+ mov r3d, 4
+.loop:
+ mova [r4+r1*0], m0
+ mova [r4+r1*1], m0
+ lea r4, [r4+r1*2]
+ mova [r4+r1*0], m0
+ mova [r4+r1*1], m0
+%endif
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_DC
+INIT_XMM sse2
+PRED16x16_DC
+INIT_XMM ssse3
+PRED16x16_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_TM 0
+cglobal pred16x16_tm_vp8_8, 2,5
+ sub r0, r1
+ pxor mm7, mm7
+ movq mm0, [r0+0]
+ movq mm2, [r0+8]
+ movq mm1, mm0
+ movq mm3, mm2
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ movzx r3d, byte [r0-1]
+ mov r4d, 16
+.loop:
+ movzx r2d, byte [r0+r1-1]
+ sub r2d, r3d
+ movd mm4, r2d
+ SPLATW mm4, mm4, 0
+ movq mm5, mm4
+ movq mm6, mm4
+ movq mm7, mm4
+ paddw mm4, mm0
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+ packuswb mm4, mm5
+ packuswb mm6, mm7
+ movq [r0+r1+0], mm4
+ movq [r0+r1+8], mm6
+ add r0, r1
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED16x16_TM
+INIT_MMX mmxext
+PRED16x16_TM
+
+INIT_XMM sse2
+cglobal pred16x16_tm_vp8_8, 2,6,6
+ sub r0, r1
+ pxor xmm2, xmm2
+ movdqa xmm0, [r0]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movzx r4d, byte [r0-1]
+ mov r5d, 8
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm4, r3d
+ pshuflw xmm2, xmm2, 0
+ pshuflw xmm4, xmm4, 0
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm4, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm3, xmm1
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ packuswb xmm2, xmm3
+ packuswb xmm4, xmm5
+ movdqa [r0+r1*1], xmm2
+ movdqa [r0+r1*2], xmm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro H264_PRED16x16_PLANE 1
+cglobal pred16x16_plane_%1_8, 2,9,7
+ mov r2, r1 ; +stride
+ neg r1 ; -stride
+
+ movh m0, [r0+r1 -1]
+%if mmsize == 8
+ pxor m4, m4
+ movh m1, [r0+r1 +3 ]
+ movh m2, [r0+r1 +8 ]
+ movh m3, [r0+r1 +12]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ pmullw m0, [pw_m8tom1 ]
+ pmullw m1, [pw_m8tom1+8]
+ pmullw m2, [pw_1to8 ]
+ pmullw m3, [pw_1to8 +8]
+ paddw m0, m2
+ paddw m1, m3
+%else ; mmsize == 16
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +8]
+ pmaddubsw m0, [plane_shuf] ; H coefficients
+%else ; sse2
+ pxor m2, m2
+ movh m1, [r0+r1 +8]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ pmullw m0, [pw_m8tom1]
+ pmullw m1, [pw_1to8]
+ paddw m0, m1
+%endif
+ movhlps m1, m0
+%endif
+ paddw m0, m1
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 32
+%endif
+ paddw m0, m1
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 16
+%endif
+ paddw m0, m1 ; sum of H coefficients
+
+ lea r4, [r0+r2*8-1]
+ lea r3, [r0+r2*4-1]
+ add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+ movzx e_reg, byte [r3+r2*2 ]
+ movzx r5, byte [r4+r1 ]
+ sub r5, e_reg
+
+ movzx e_reg, byte [r3+r2 ]
+ movzx r6, byte [r4 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*2]
+
+ movzx e_reg, byte [r3+r1 ]
+ movzx r6, byte [r4+r2*2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+
+ movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+%else
+ movzx r6, byte [r4+r2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+ sub r5, r6
+%endif
+
+ lea e_reg, [r3+r1*4]
+ lea r3, [r4+r2*4]
+
+ movzx r4, byte [e_reg+r2 ]
+ movzx r6, byte [r3 ]
+ sub r6, r4
+%if ARCH_X86_64
+ lea r6, [r7+r6*2]
+ lea r5, [r5+r6*2]
+ add r5, r6
+%else
+ lea r5, [r5+r6*4]
+ lea r5, [r5+r6*2]
+%endif
+
+ movzx r4, byte [e_reg ]
+%if ARCH_X86_64
+ movzx r7, byte [r3 +r2 ]
+ sub r7, r4
+ sub r5, r7
+%else
+ movzx r6, byte [r3 +r2 ]
+ sub r6, r4
+ lea r5, [r5+r6*8]
+ sub r5, r6
+%endif
+
+ movzx r4, byte [e_reg+r1 ]
+ movzx r6, byte [r3 +r2*2]
+ sub r6, r4
+%if ARCH_X86_64
+ add r6, r7
+%endif
+ lea r5, [r5+r6*8]
+
+ movzx r4, byte [e_reg+r2*2]
+ movzx r6, byte [r3 +r1 ]
+ sub r6, r4
+ lea r5, [r5+r6*4]
+ add r5, r6 ; sum of V coefficients
+
+%if ARCH_X86_64 == 0
+ mov r0, r0m
+%endif
+
+%ifidn %1, h264
+ lea r5, [r5*5+32]
+ sar r5, 6
+%elifidn %1, rv40
+ lea r5, [r5*5]
+ sar r5, 6
+%elifidn %1, svq3
+ test r5, r5
+ lea r6, [r5+3]
+ cmovs r5, r6
+ sar r5, 2 ; V/4
+ lea r5, [r5*5] ; 5*(V/4)
+ test r5, r5
+ lea r6, [r5+15]
+ cmovs r5, r6
+ sar r5, 4 ; (5*(V/4))/16
+%endif
+
+ movzx r4, byte [r0+r1 +15]
+ movzx r3, byte [r3+r2*2 ]
+ lea r3, [r3+r4+1]
+ shl r3, 4
+
+ movd r1d, m0
+ movsx r1d, r1w
+%ifnidn %1, svq3
+%ifidn %1, h264
+ lea r1d, [r1d*5+32]
+%else ; rv40
+ lea r1d, [r1d*5]
+%endif
+ sar r1d, 6
+%else ; svq3
+ test r1d, r1d
+ lea r4d, [r1d+3]
+ cmovs r1d, r4d
+ sar r1d, 2 ; H/4
+ lea r1d, [r1d*5] ; 5*(H/4)
+ test r1d, r1d
+ lea r4d, [r1d+15]
+ cmovs r1d, r4d
+ sar r1d, 4 ; (5*(H/4))/16
+%endif
+ movd m0, r1d
+
+ add r1d, r5d
+ add r3d, r1d
+ shl r1d, 3
+ sub r3d, r1d ; a
+
+ movd m1, r5d
+ movd m3, r3d
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
+%ifidn %1, svq3
+ SWAP 0, 1
+%endif
+ mova m2, m0
+%if mmsize == 8
+ mova m5, m0
+%endif
+ pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+%if mmsize == 16
+ psllw m2, 3
+%else
+ psllw m5, 3
+ psllw m2, 2
+ mova m6, m5
+ paddw m6, m2
+%endif
+ paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+ paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
+%if mmsize == 8
+ paddw m5, m0 ; a + {8,9,10,11}*H
+ paddw m6, m0 ; a + {12,13,14,15}*H
+%endif
+
+ mov r4, 8
+.loop:
+ mova m3, m0 ; b[0..7]
+ mova m4, m2 ; b[8..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0], m3
+%if mmsize == 8
+ mova m3, m5 ; b[8..11]
+ mova m4, m6 ; b[12..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+8], m3
+%endif
+ paddw m0, m1
+ paddw m2, m1
+%if mmsize == 8
+ paddw m5, m1
+ paddw m6, m1
+%endif
+
+ mova m3, m0 ; b[0..7]
+ mova m4, m2 ; b[8..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+r2], m3
+%if mmsize == 8
+ mova m3, m5 ; b[8..11]
+ mova m4, m6 ; b[12..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+r2+8], m3
+%endif
+ paddw m0, m1
+ paddw m2, m1
+%if mmsize == 8
+ paddw m5, m1
+ paddw m6, m1
+%endif
+
+ lea r0, [r0+r2*2]
+ dec r4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_MMX mmxext
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM sse2
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM ssse3
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro H264_PRED8x8_PLANE 0
+cglobal pred8x8_plane_8, 2,9,7
+ mov r2, r1 ; +stride
+ neg r1 ; -stride
+
+ movd m0, [r0+r1 -1]
+%if mmsize == 8
+ pxor m2, m2
+ movh m1, [r0+r1 +4 ]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ pmullw m0, [pw_m4to4]
+ pmullw m1, [pw_m4to4+8]
+%else ; mmsize == 16
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
+ pmaddubsw m0, [plane8_shuf] ; H coefficients
+%else ; sse2
+ pxor m2, m2
+ movd m1, [r0+r1 +4]
+ punpckldq m0, m1
+ punpcklbw m0, m2
+ pmullw m0, [pw_m4to4]
+%endif
+ movhlps m1, m0
+%endif
+ paddw m0, m1
+
+%if notcpuflag(ssse3)
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 32
+%endif
+ paddw m0, m1
+%endif ; !ssse3
+
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 16
+%endif
+ paddw m0, m1 ; sum of H coefficients
+
+ lea r4, [r0+r2*4-1]
+ lea r3, [r0 -1]
+ add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+ movzx e_reg, byte [r3+r2*2 ]
+ movzx r5, byte [r4+r1 ]
+ sub r5, e_reg
+
+ movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+ sub r5, r7
+%else
+ movzx r6, byte [r4+r2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+ sub r5, r6
+%endif
+
+ movzx e_reg, byte [r3+r1 ]
+ movzx r6, byte [r4+r2*2 ]
+ sub r6, e_reg
+%if ARCH_X86_64
+ add r6, r7
+%endif
+ lea r5, [r5+r6*4]
+
+ movzx e_reg, byte [r3+r2 ]
+ movzx r6, byte [r4 ]
+ sub r6, e_reg
+ lea r6, [r5+r6*2]
+
+ lea r5, [r6*9+16]
+ lea r5, [r5+r6*8]
+ sar r5, 5
+
+%if ARCH_X86_64 == 0
+ mov r0, r0m
+%endif
+
+ movzx r3, byte [r4+r2*2 ]
+ movzx r4, byte [r0+r1 +7]
+ lea r3, [r3+r4+1]
+ shl r3, 4
+ movd r1d, m0
+ movsx r1d, r1w
+ imul r1d, 17
+ add r1d, 16
+ sar r1d, 5
+ movd m0, r1d
+ add r1d, r5d
+ sub r3d, r1d
+ add r1d, r1d
+ sub r3d, r1d ; a
+
+ movd m1, r5d
+ movd m3, r3d
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
+%if mmsize == 8
+ mova m2, m0
+%endif
+ pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+ paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+%if mmsize == 8
+ psllw m2, 2
+ paddw m2, m0 ; a + {4,5,6,7}*H
+%endif
+
+ mov r4, 4
+ALIGN 16
+.loop:
+%if mmsize == 16
+ mova m3, m0 ; b[0..7]
+ paddw m0, m1
+ psraw m3, 5
+ mova m4, m0 ; V+b[0..7]
+ paddw m0, m1
+ psraw m4, 5
+ packuswb m3, m4
+ movh [r0], m3
+ movhps [r0+r2], m3
+%else ; mmsize == 8
+ mova m3, m0 ; b[0..3]
+ mova m4, m2 ; b[4..7]
+ paddw m0, m1
+ paddw m2, m1
+ psraw m3, 5
+ psraw m4, 5
+ mova m5, m0 ; V+b[0..3]
+ mova m6, m2 ; V+b[4..7]
+ paddw m0, m1
+ paddw m2, m1
+ psraw m5, 5
+ psraw m6, 5
+ packuswb m3, m4
+ packuswb m5, m6
+ mova [r0], m3
+ mova [r0+r2], m5
+%endif
+
+ lea r0, [r0+r2*2]
+ dec r4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+H264_PRED8x8_PLANE
+INIT_MMX mmxext
+H264_PRED8x8_PLANE
+INIT_XMM sse2
+H264_PRED8x8_PLANE
+INIT_XMM ssse3
+H264_PRED8x8_PLANE
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal pred8x8_vertical_8, 2,2
+ sub r0, r1
+ movq mm0, [r0]
+%rep 3
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ lea r0, [r0+r1*2]
+%endrep
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_H 0
+cglobal pred8x8_horizontal_8, 2,3
+ mov r2, 4
+%if cpuflag(ssse3)
+ mova m2, [pb_3]
+%endif
+.loop:
+ SPLATB_LOAD m0, r0+r1*0-1, m2
+ SPLATB_LOAD m1, r0+r1*1-1, m2
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED8x8_H
+INIT_MMX mmxext
+PRED8x8_H
+INIT_MMX ssse3
+PRED8x8_H
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred8x8_top_dc_8, 2,5
+ sub r0, r1
+ movq mm0, [r0]
+ pxor mm1, mm1
+ pxor mm2, mm2
+ lea r2, [r0+r1*2]
+ punpckhbw mm1, mm0
+ punpcklbw mm0, mm2
+ psadbw mm1, mm2 ; s1
+ lea r3, [r2+r1*2]
+ psadbw mm0, mm2 ; s0
+ psrlw mm1, 1
+ psrlw mm0, 1
+ pavgw mm1, mm2
+ lea r4, [r3+r1*2]
+ pavgw mm0, mm2
+ pshufw mm1, mm1, 0
+ pshufw mm0, mm0, 0 ; dc0 (w)
+ packuswb mm0, mm1 ; dc0,dc1 (b)
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ lea r0, [r3+r1*2]
+ movq [r2+r1*1], mm0
+ movq [r2+r1*2], mm0
+ movq [r3+r1*1], mm0
+ movq [r3+r1*2], mm0
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8_dc_8, 2,5
+ sub r0, r1
+ pxor m7, m7
+ movd m0, [r0+0]
+ movd m1, [r0+4]
+ psadbw m0, m7 ; s0
+ mov r4, r0
+ psadbw m1, m7 ; s1
+
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ lea r0, [r0+r1*2]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*1-1]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*2-1]
+ add r2d, r3d
+ lea r0, [r0+r1*2]
+ movd m2, r2d ; s2
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ lea r0, [r0+r1*2]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*1-1]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*2-1]
+ add r2d, r3d
+ movd m3, r2d ; s3
+
+ punpcklwd m0, m1
+ mov r0, r4
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; s0, s1, s2, s3
+ pshufw m3, m0, 11110110b ; s2, s1, s3, s3
+ lea r2, [r0+r1*2]
+ pshufw m0, m0, 01110100b ; s0, s1, s3, s1
+ paddw m0, m3
+ lea r3, [r2+r1*2]
+ psrlw m0, 2
+ pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+ lea r4, [r3+r1*2]
+ packuswb m0, m0
+ punpcklbw m0, m0
+ movq m1, m0
+ punpcklbw m0, m0
+ punpckhbw m1, m1
+ movq [r0+r1*1], m0
+ movq [r0+r1*2], m0
+ movq [r2+r1*1], m0
+ movq [r2+r1*2], m0
+ movq [r3+r1*1], m1
+ movq [r3+r1*2], m1
+ movq [r4+r1*1], m1
+ movq [r4+r1*2], m1
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8_dc_rv40_8, 2,7
+ mov r4, r0
+ sub r0, r1
+ pxor mm0, mm0
+ psadbw mm0, [r0]
+ dec r0
+ movzx r5d, byte [r0+r1*1]
+ movd r6d, mm0
+ lea r0, [r0+r1*2]
+%rep 3
+ movzx r2d, byte [r0+r1*0]
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0]
+ add r5d, r6d
+ lea r2d, [r2+r5+8]
+ shr r2d, 4
+ movd mm0, r2d
+ punpcklbw mm0, mm0
+ pshufw mm0, mm0, 0
+ mov r3d, 4
+.loop:
+ movq [r4+r1*0], mm0
+ movq [r4+r1*1], mm0
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_TM 0
+cglobal pred8x8_tm_vp8_8, 2,6
+ sub r0, r1
+ pxor mm7, mm7
+ movq mm0, [r0]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ movzx r4d, byte [r0-1]
+ mov r5d, 4
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd mm2, r2d
+ movd mm4, r3d
+ SPLATW mm2, mm2, 0
+ SPLATW mm4, mm4, 0
+ movq mm3, mm2
+ movq mm5, mm4
+ paddw mm2, mm0
+ paddw mm3, mm1
+ paddw mm4, mm0
+ paddw mm5, mm1
+ packuswb mm2, mm3
+ packuswb mm4, mm5
+ movq [r0+r1*1], mm2
+ movq [r0+r1*2], mm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED8x8_TM
+INIT_MMX mmxext
+PRED8x8_TM
+
+INIT_XMM sse2
+cglobal pred8x8_tm_vp8_8, 2,6,4
+ sub r0, r1
+ pxor xmm1, xmm1
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1
+ movzx r4d, byte [r0-1]
+ mov r5d, 4
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm3, r3d
+ pshuflw xmm2, xmm2, 0
+ pshuflw xmm3, xmm3, 0
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ paddw xmm2, xmm0
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3
+ movq [r0+r1*1], xmm2
+ movhps [r0+r1*2], xmm2
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+
+INIT_XMM ssse3
+cglobal pred8x8_tm_vp8_8, 2,3,6
+ sub r0, r1
+ movdqa xmm4, [tm_shuf]
+ pxor xmm1, xmm1
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1
+ movd xmm5, [r0-4]
+ pshufb xmm5, xmm4
+ mov r2d, 4
+.loop:
+ movd xmm2, [r0+r1*1-4]
+ movd xmm3, [r0+r1*2-4]
+ pshufb xmm2, xmm4
+ pshufb xmm3, xmm4
+ psubw xmm2, xmm5
+ psubw xmm3, xmm5
+ paddw xmm2, xmm0
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3
+ movq [r0+r1*1], xmm2
+ movhps [r0+r1*2], xmm2
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED4x4_LOWPASS 5
+ mova %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ mova %1, %4
+ pand %3, [pb_1]
+ psubusb %2, %3
+ pavgb %1, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_TOP_DC 0
+cglobal pred8x8l_top_dc_8, 4,4
+ sub r0, r3
+ pxor mm7, mm7
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d ; top_left
+ jz .fix_lt_2
+ test r2d, r2d ; top_right
+ jz .fix_tr_1
+ jmp .body
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d ; top_right
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+.body:
+ PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+ psadbw mm7, mm0
+ paddw mm7, [pw_4]
+ psrlw mm7, 3
+ pshufw mm7, mm7, 0
+ packuswb mm7, mm7
+%rep 3
+ movq [r0+r3*1], mm7
+ movq [r0+r3*2], mm7
+ lea r0, [r0+r3*2]
+%endrep
+ movq [r0+r3*1], mm7
+ movq [r0+r3*2], mm7
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_TOP_DC
+INIT_MMX ssse3
+PRED8x8L_TOP_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_DC 0
+cglobal pred8x8l_dc_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .body
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.body:
+ lea r1, [r0+r3*2]
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ pxor mm0, mm0
+ pxor mm1, mm1
+ lea r2, [r1+r3*2]
+ psadbw mm0, mm7
+ psadbw mm1, mm6
+ paddw mm0, [pw_8]
+ paddw mm0, mm1
+ lea r4, [r2+r3*2]
+ psrlw mm0, 4
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ movq [r1+r3*1], mm0
+ movq [r1+r3*2], mm0
+ movq [r2+r3*1], mm0
+ movq [r2+r3*2], mm0
+ movq [r4+r3*1], mm0
+ movq [r4+r3*2], mm0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_DC
+INIT_MMX ssse3
+PRED8x8L_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_HORIZONTAL 0
+cglobal pred8x8l_horizontal_8, 4,4
+ sub r0, r3
+ lea r2, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ test r1d, r1d
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
+ movq mm1, [r2+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r2, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r1+r3*0-8]
+ mov r0, r2
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm3, mm7
+ lea r1, [r0+r3*2]
+ movq mm7, mm3
+ punpckhbw mm3, mm3
+ punpcklbw mm7, mm7
+ pshufw mm0, mm3, 0xff
+ pshufw mm1, mm3, 0xaa
+ lea r2, [r1+r3*2]
+ pshufw mm2, mm3, 0x55
+ pshufw mm3, mm3, 0x00
+ pshufw mm4, mm7, 0xff
+ pshufw mm5, mm7, 0xaa
+ pshufw mm6, mm7, 0x55
+ pshufw mm7, mm7, 0x00
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm1
+ movq [r1+r3*1], mm2
+ movq [r1+r3*2], mm3
+ movq [r2+r3*1], mm4
+ movq [r2+r3*2], mm5
+ lea r0, [r2+r3*2]
+ movq [r0+r3*1], mm6
+ movq [r0+r3*2], mm7
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_VERTICAL 0
+cglobal pred8x8l_vertical_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d ; top_left
+ jz .fix_lt_2
+ test r2d, r2d ; top_right
+ jz .fix_tr_1
+ jmp .body
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d ; top_right
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+.body:
+ PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+%rep 3
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ lea r0, [r0+r3*2]
+%endrep
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_VERTICAL
+INIT_MMX ssse3
+PRED8x8L_VERTICAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_down_left_8, 4,5
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm7, mm4
+ test r2d, r2d
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ lea r1, [r0+r3*2]
+ movq mm6, mm1
+ psrlq mm1, 56
+ movq mm4, mm1
+ lea r2, [r1+r3*2]
+ movq mm2, mm6
+ PALIGNR mm2, mm7, 1, mm0
+ movq mm3, mm6
+ PALIGNR mm3, mm7, 7, mm0
+ PALIGNR mm4, mm6, 1, mm0
+ movq mm5, mm7
+ movq mm1, mm7
+ movq mm7, mm6
+ lea r4, [r2+r3*2]
+ psllq mm1, 8
+ PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
+ PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
+ movq [r4+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r4+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r0+r3*2], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
+ movq [r0+r3*1], mm1
+ RET
+
+%macro PRED8x8L_DOWN_LEFT 0
+cglobal pred8x8l_down_left_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d ; top_left
+ jz .fix_lt_2
+ test r2d, r2d ; top_right
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d ; top_right
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm3, mm4
+ test r2d, r2d ; top_right
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm4, mm1
+ psrlq mm1, 56
+ movq2dq xmm5, mm1
+ lea r1, [r0+r3*2]
+ pslldq xmm4, 8
+ por xmm3, xmm4
+ movdqa xmm2, xmm3
+ psrldq xmm2, 1
+ pslldq xmm5, 15
+ por xmm2, xmm5
+ lea r2, [r1+r3*2]
+ movdqa xmm1, xmm3
+ pslldq xmm1, 1
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*2], xmm0
+ psrldq xmm0, 1
+ lea r0, [r2+r3*2]
+ movq [r1+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r1+r3*2], xmm0
+ psrldq xmm0, 1
+ movq [r2+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r2+r3*2], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*2], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_LEFT
+INIT_MMX ssse3
+PRED8x8L_DOWN_LEFT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_down_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d ; top_left
+ jz .fix_lt_1
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq mm6, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d ; top_left
+ jz .fix_lt_2
+ test r2d, r2d ; top_right
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm5, mm4
+ jmp .body
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d ; top_right
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.body:
+ lea r1, [r0+r3*2]
+ movq mm1, mm7
+ movq mm7, mm5
+ movq mm5, mm6
+ movq mm2, mm7
+ lea r2, [r1+r3*2]
+ PALIGNR mm2, mm6, 1, mm0
+ movq mm3, mm7
+ PALIGNR mm3, mm6, 7, mm0
+ movq mm4, mm7
+ lea r4, [r2+r3*2]
+ psrlq mm4, 8
+ PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
+ PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
+ movq [r4+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r4+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r2+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r2+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r1+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r1+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r0+r3*2], mm0
+ psrlq mm0, 8
+ psllq mm1, 56
+ por mm0, mm1
+ movq [r0+r3*1], mm0
+ RET
+
+%macro PRED8x8L_DOWN_RIGHT 0
+cglobal pred8x8l_down_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jz .fix_lt_1
+ jmp .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq2dq xmm3, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq2dq xmm1, mm7
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm4, mm4
+ lea r1, [r0+r3*2]
+ movdqa xmm0, xmm3
+ pslldq xmm4, 8
+ por xmm3, xmm4
+ lea r2, [r1+r3*2]
+ pslldq xmm4, 1
+ por xmm1, xmm4
+ psrldq xmm0, 7
+ pslldq xmm0, 15
+ psrldq xmm0, 7
+ por xmm1, xmm0
+ lea r0, [r2+r3*2]
+ movdqa xmm2, xmm3
+ psrldq xmm2, 1
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ movq [r0+r3*2], xmm0
+ movq [r0+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r2+r3*2], xmm0
+ movq [r2+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r1+r3*2], xmm0
+ movq [r1+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r4+r3*2], xmm0
+ movq [r4+r3*1], xmm1
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_RIGHT
+INIT_MMX ssse3
+PRED8x8L_DOWN_RIGHT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_vertical_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jz .fix_lt_1
+ jmp .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm7, mm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ lea r1, [r0+r3*2]
+ movq mm2, mm6
+ movq mm3, mm6
+ PALIGNR mm3, mm7, 7, mm0
+ PALIGNR mm6, mm7, 6, mm1
+ movq mm4, mm3
+ pavgb mm3, mm2
+ lea r2, [r1+r3*2]
+ PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
+ movq [r0+r3*1], mm3
+ movq [r0+r3*2], mm0
+ movq mm5, mm0
+ movq mm6, mm3
+ movq mm1, mm7
+ movq mm2, mm1
+ psllq mm2, 8
+ movq mm3, mm1
+ psllq mm3, 16
+ lea r4, [r2+r3*2]
+ PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r1+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r1+r3*2], mm5
+ psllq mm0, 8
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r2+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r2+r3*2], mm5
+ psllq mm0, 8
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r4+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r4+r3*2], mm5
+ RET
+
+%macro PRED8x8L_VERTICAL_RIGHT 0
+cglobal pred8x8l_vertical_right_8, 4,5,7
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM 7
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq2dq xmm0, mm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ lea r1, [r0+r3*2]
+ movq2dq xmm4, mm6
+ pslldq xmm4, 8
+ por xmm0, xmm4
+ movdqa xmm6, [pw_ff00]
+ movdqa xmm1, xmm0
+ lea r2, [r1+r3*2]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslldq xmm0, 1
+ pslldq xmm1, 2
+ pavgb xmm2, xmm0
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
+ pandn xmm6, xmm4
+ movdqa xmm5, xmm4
+ psrlw xmm4, 8
+ packuswb xmm6, xmm4
+ movhlps xmm4, xmm6
+ movhps [r0+r3*2], xmm5
+ movhps [r0+r3*1], xmm2
+ psrldq xmm5, 4
+ movss xmm5, xmm6
+ psrldq xmm2, 4
+ movss xmm2, xmm4
+ lea r0, [r2+r3*2]
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r0+r3*2], xmm5
+ movq [r0+r3*1], xmm2
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r2+r3*2], xmm5
+ movq [r2+r3*1], xmm2
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r1+r3*2], xmm5
+ movq [r1+r3*1], xmm2
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_RIGHT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_RIGHT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_VERTICAL_LEFT 0
+cglobal pred8x8l_vertical_left_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm4, mm4
+ test r2d, r2d
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm3, mm1
+ lea r1, [r0+r3*2]
+ pslldq xmm3, 8
+ por xmm4, xmm3
+ movdqa xmm2, xmm4
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm4
+ psrldq xmm2, 1
+ pslldq xmm1, 1
+ pavgb xmm3, xmm2
+ lea r2, [r1+r3*2]
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm3
+ movq [r0+r3*2], xmm0
+ lea r0, [r2+r3*2]
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r1+r3*1], xmm3
+ movq [r1+r3*2], xmm0
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r2+r3*1], xmm3
+ movq [r2+r3*2], xmm0
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm3
+ movq [r0+r3*2], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_LEFT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_LEFT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_HORIZONTAL_UP 0
+cglobal pred8x8l_horizontal_up_8, 4,4
+ sub r0, r3
+ lea r2, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ test r1d, r1d
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
+ movq mm1, [r2+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r2, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r1+r3*0-8]
+ mov r0, r2
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ lea r1, [r0+r3*2]
+ pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+ psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
+ movq mm2, mm0
+ psllw mm0, 8
+ psrlw mm2, 8
+ por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
+ movq mm3, mm2
+ movq mm4, mm2
+ movq mm5, mm2
+ psrlq mm2, 8
+ psrlq mm3, 16
+ lea r2, [r1+r3*2]
+ por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
+ punpckhbw mm7, mm7
+ por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
+ pavgb mm4, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
+ movq mm5, mm4
+ punpcklbw mm4, mm1 ; p4 p3 p2 p1
+ punpckhbw mm5, mm1 ; p8 p7 p6 p5
+ movq mm6, mm5
+ movq mm7, mm5
+ movq mm0, mm5
+ PALIGNR mm5, mm4, 2, mm1
+ pshufw mm1, mm6, 11111001b
+ PALIGNR mm6, mm4, 4, mm2
+ pshufw mm2, mm7, 11111110b
+ PALIGNR mm7, mm4, 6, mm3
+ pshufw mm3, mm0, 11111111b
+ movq [r0+r3*1], mm4
+ movq [r0+r3*2], mm5
+ lea r0, [r2+r3*2]
+ movq [r1+r3*1], mm6
+ movq [r1+r3*2], mm7
+ movq [r2+r3*1], mm0
+ movq [r2+r3*2], mm1
+ movq [r0+r3*1], mm2
+ movq [r0+r3*2], mm3
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL_UP
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_UP
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_horizontal_down_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq mm6, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm5, mm4
+ lea r1, [r0+r3*2]
+ psllq mm7, 56
+ movq mm2, mm5
+ movq mm3, mm6
+ movq mm4, mm2
+ PALIGNR mm2, mm6, 7, mm5
+ PALIGNR mm6, mm7, 7, mm0
+ lea r2, [r1+r3*2]
+ PALIGNR mm4, mm3, 1, mm7
+ movq mm5, mm3
+ pavgb mm3, mm6
+ PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
+ movq mm4, mm2
+ movq mm1, mm2
+ lea r4, [r2+r3*2]
+ psrlq mm4, 16
+ psrlq mm1, 8
+ PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
+ movq mm7, mm3
+ punpcklbw mm3, mm0
+ punpckhbw mm7, mm0
+ movq mm1, mm7
+ movq mm0, mm7
+ movq mm4, mm7
+ movq [r4+r3*2], mm3
+ PALIGNR mm7, mm3, 2, mm5
+ movq [r4+r3*1], mm7
+ PALIGNR mm1, mm3, 4, mm5
+ movq [r2+r3*2], mm1
+ PALIGNR mm0, mm3, 6, mm3
+ movq [r2+r3*1], mm0
+ movq mm2, mm6
+ movq mm3, mm6
+ movq [r1+r3*2], mm4
+ PALIGNR mm6, mm4, 2, mm5
+ movq [r1+r3*1], mm6
+ PALIGNR mm2, mm4, 4, mm5
+ movq [r0+r3*2], mm2
+ PALIGNR mm3, mm4, 6, mm4
+ movq [r0+r3*1], mm3
+ RET
+
+%macro PRED8x8L_HORIZONTAL_DOWN 0
+cglobal pred8x8l_horizontal_down_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1d, r1d
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2d, r2d
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq2dq xmm0, mm2
+ pslldq xmm0, 8
+ movq mm4, mm0
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ movq2dq xmm2, mm1
+ pslldq xmm2, 15
+ psrldq xmm2, 8
+ por xmm0, xmm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1d, r1d
+ jz .fix_lt_2
+ test r2d, r2d
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm1, mm4
+ test r2d, r2d
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm5, mm1
+ pslldq xmm5, 8
+ por xmm1, xmm5
+INIT_XMM cpuname
+ lea r2, [r4+r3*2]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ PALIGNR xmm1, xmm0, 7, xmm4
+ PALIGNR xmm2, xmm0, 9, xmm5
+ lea r1, [r2+r3*2]
+ PALIGNR xmm3, xmm0, 8, xmm0
+ movdqa xmm4, xmm1
+ pavgb xmm4, xmm3
+ lea r0, [r1+r3*2]
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
+ punpcklbw xmm4, xmm0
+ movhlps xmm0, xmm4
+ movq [r0+r3*2], xmm4
+ movq [r2+r3*2], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r0+r3*1], xmm4
+ movq [r2+r3*1], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r1+r3*2], xmm4
+ movq [r4+r3*2], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r1+r3*1], xmm4
+ movq [r4+r3*1], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_HORIZONTAL_DOWN
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_DOWN
+
+;-------------------------------------------------------------------------------
+; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-------------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_dc_8, 3,5
+ pxor mm7, mm7
+ mov r4, r0
+ sub r0, r2
+ movd mm0, [r0]
+ psadbw mm0, mm7
+ movzx r1d, byte [r0+r2*1-1]
+ movd r3d, mm0
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*2-1]
+ lea r0, [r0+r2*2]
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*1-1]
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*2-1]
+ add r3d, r1d
+ add r3d, 4
+ shr r3d, 3
+ imul r3d, 0x01010101
+ mov [r4+r2*0], r3d
+ mov [r0+r2*0], r3d
+ mov [r0+r2*1], r3d
+ mov [r0+r2*2], r3d
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED4x4_TM 0
+cglobal pred4x4_tm_vp8_8, 3,6
+ sub r0, r2
+ pxor mm7, mm7
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movzx r4d, byte [r0-1]
+ mov r5d, 2
+.loop:
+ movzx r1d, byte [r0+r2*1-1]
+ movzx r3d, byte [r0+r2*2-1]
+ sub r1d, r4d
+ sub r3d, r4d
+ movd mm2, r1d
+ movd mm4, r3d
+%if cpuflag(mmxext)
+ pshufw mm2, mm2, 0
+ pshufw mm4, mm4, 0
+%else
+ punpcklwd mm2, mm2
+ punpcklwd mm4, mm4
+ punpckldq mm2, mm2
+ punpckldq mm4, mm4
+%endif
+ paddw mm2, mm0
+ paddw mm4, mm0
+ packuswb mm2, mm2
+ packuswb mm4, mm4
+ movd [r0+r2*1], mm2
+ movd [r0+r2*2], mm4
+ lea r0, [r0+r2*2]
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED4x4_TM
+INIT_MMX mmxext
+PRED4x4_TM
+
+INIT_XMM ssse3
+cglobal pred4x4_tm_vp8_8, 3,3
+ sub r0, r2
+ movq mm6, [tm_shuf]
+ pxor mm1, mm1
+ movd mm0, [r0]
+ punpcklbw mm0, mm1
+ movd mm7, [r0-4]
+ pshufb mm7, mm6
+ lea r1, [r0+r2*2]
+ movd mm2, [r0+r2*1-4]
+ movd mm3, [r0+r2*2-4]
+ movd mm4, [r1+r2*1-4]
+ movd mm5, [r1+r2*2-4]
+ pshufb mm2, mm6
+ pshufb mm3, mm6
+ pshufb mm4, mm6
+ pshufb mm5, mm6
+ psubw mm0, mm7
+ paddw mm2, mm0
+ paddw mm3, mm0
+ paddw mm4, mm0
+ paddw mm5, mm0
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+ packuswb mm4, mm4
+ packuswb mm5, mm5
+ movd [r0+r2*1], mm2
+ movd [r0+r2*2], mm3
+ movd [r1+r2*1], mm4
+ movd [r1+r2*2], mm5
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_vp8_8, 3,3
+ sub r0, r2
+ movd m1, [r0-1]
+ movd m0, [r0]
+ mova m2, m0 ;t0 t1 t2 t3
+ punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
+ lea r1, [r0+r2*2]
+ psrlq m0, 8 ;t1 t2 t3 t4
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ movd [r0+r2*1], m3
+ movd [r0+r2*2], m3
+ movd [r1+r2*1], m3
+ movd [r1+r2*2], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred4x4_down_left_8, 3,3
+ sub r0, r2
+ movq m1, [r0]
+ punpckldq m1, [r1]
+ movq m2, m1
+ movq m3, m1
+ psllq m1, 8
+ pxor m2, m1
+ psrlq m2, 8
+ pxor m2, m3
+ PRED4x4_LOWPASS m0, m1, m2, m3, m4
+ lea r1, [r0+r2*2]
+ psrlq m0, 8
+ movd [r0+r2*1], m0
+ psrlq m0, 8
+ movd [r0+r2*2], m0
+ psrlq m0, 8
+ movd [r1+r2*1], m0
+ psrlq m0, 8
+ movd [r1+r2*2], m0
+ RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_left_8, 3,3
+ sub r0, r2
+ movq m1, [r0]
+ punpckldq m1, [r1]
+ movq m3, m1
+ movq m2, m1
+ psrlq m3, 8
+ psrlq m2, 16
+ movq m4, m3
+ pavgb m4, m1
+ PRED4x4_LOWPASS m0, m1, m2, m3, m5
+ lea r1, [r0+r2*2]
+ movh [r0+r2*1], m4
+ movh [r0+r2*2], m0
+ psrlq m4, 8
+ psrlq m0, 8
+ movh [r1+r2*1], m4
+ movh [r1+r2*2], m0
+ RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_up_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movd m0, [r0+r2*1-4]
+ punpcklbw m0, [r0+r2*2-4]
+ movd m1, [r1+r2*1-4]
+ punpcklbw m1, [r1+r2*2-4]
+ punpckhwd m0, m1
+ movq m1, m0
+ punpckhbw m1, m1
+ pshufw m1, m1, 0xFF
+ punpckhdq m0, m1
+ movq m2, m0
+ movq m3, m0
+ movq m7, m0
+ psrlq m2, 16
+ psrlq m3, 8
+ pavgb m7, m3
+ PRED4x4_LOWPASS m4, m0, m2, m3, m5
+ punpcklbw m7, m4
+ movd [r0+r2*1], m7
+ psrlq m7, 16
+ movd [r0+r2*2], m7
+ psrlq m7, 16
+ movd [r1+r2*1], m7
+ movd [r1+r2*2], m1
+ RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
+; const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_down_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movh m0, [r0-4] ; lt ..
+ punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
+ psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
+ movd m1, [r1+r2*2-4] ; l3
+ punpcklbw m1, [r1+r2*1-4] ; l2 l3
+ movd m2, [r0+r2*2-4] ; l1
+ punpcklbw m2, [r0+r2*1-4] ; l0 l1
+ punpckhwd m1, m2 ; l0 l1 l2 l3
+ punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+ movq m0, m1
+ movq m2, m1
+ movq m5, m1
+ psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
+ psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
+ pavgb m5, m2
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ punpcklbw m5, m3
+ psrlq m3, 32
+ PALIGNR m3, m5, 6, m4
+ movh [r1+r2*2], m5
+ psrlq m5, 16
+ movh [r1+r2*1], m5
+ psrlq m5, 16
+ movh [r0+r2*2], m5
+ movh [r0+r2*1], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
+; const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_right_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movh m0, [r0] ; ........t3t2t1t0
+ movq m5, m0
+ PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
+ pavgb m5, m0
+ PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
+ movq m1, m0
+ PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
+ movq m2, m0
+ PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ movq m1, m3
+ psrlq m3, 16
+ psllq m1, 48
+ movh [r0+r2*1], m5
+ movh [r0+r2*2], m3
+ PALIGNR m5, m1, 7, m2
+ psllq m1, 8
+ movh [r1+r2*1], m5
+ PALIGNR m3, m1, 7, m1
+ movh [r1+r2*2], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_down_right_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m1, [r1-8]
+ movq m2, [r0+r2*1-8]
+ punpckhbw m2, [r0-8]
+ movh m3, [r0]
+ punpckhwd m1, m2
+ PALIGNR m3, m1, 5, m1
+ movq m1, m3
+ PALIGNR m3, [r1+r2*1-8], 7, m4
+ movq m2, m3
+ PALIGNR m3, [r1+r2*2-8], 7, m4
+ PRED4x4_LOWPASS m0, m3, m1, m2, m4
+ movh [r1+r2*2], m0
+ psrlq m0, 8
+ movh [r1+r2*1], m0
+ psrlq m0, 8
+ movh [r0+r2*2], m0
+ psrlq m0, 8
+ movh [r0+r2*1], m0
+ RET
diff --git a/libs/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm b/libs/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm
new file mode 100644
index 000000000..629e0a72e
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm
@@ -0,0 +1,1199 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pw_512
+cextern pw_16
+cextern pw_8
+cextern pw_4
+cextern pw_2
+cextern pw_1
+cextern pd_16
+
+pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
+pw_m3: times 8 dw -3
+pd_17: times 4 dd 17
+
+SECTION .text
+
+; dest, left, right, src
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED4x4_LOWPASS 4
+ paddw %2, %3
+ psrlw %2, 1
+ pavgw %1, %4, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_DR 0
+cglobal pred4x4_down_right_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movhps m1, [r1-8]
+ movhps m2, [r0+r2*1-8]
+ movhps m4, [r0-8]
+ punpckhwd m2, m4
+ movq m3, [r0]
+ punpckhdq m1, m2
+ PALIGNR m3, m1, 10, m1
+ movhps m4, [r1+r2*1-8]
+ PALIGNR m0, m3, m4, 14, m4
+ movhps m4, [r1+r2*2-8]
+ PALIGNR m2, m0, m4, 14, m4
+ PRED4x4_LOWPASS m0, m2, m3, m0
+ movq [r1+r2*2], m0
+ psrldq m0, 2
+ movq [r1+r2*1], m0
+ psrldq m0, 2
+ movq [r0+r2*2], m0
+ psrldq m0, 2
+ movq [r0+r2*1], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_DR
+INIT_XMM ssse3
+PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_DR
+%endif
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+%macro PRED4x4_VR 0
+cglobal pred4x4_vertical_right_10, 3, 3, 6
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m5, [r0] ; ........t3t2t1t0
+ movhps m1, [r0-8]
+ PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
+ pavgw m5, m0
+ movhps m1, [r0+r2*1-8]
+ PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
+ movhps m2, [r0+r2*2-8]
+ PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
+ movhps m3, [r1+r2*1-8]
+ PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
+ PRED4x4_LOWPASS m1, m0, m2, m1
+ pslldq m0, m1, 12
+ psrldq m1, 4
+ movq [r0+r2*1], m5
+ movq [r0+r2*2], m1
+ PALIGNR m5, m0, 14, m2
+ pslldq m0, 2
+ movq [r1+r2*1], m5
+ PALIGNR m1, m0, 14, m0
+ movq [r1+r2*2], m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_VR
+INIT_XMM ssse3
+PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_VR
+%endif
+
+;-------------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;-------------------------------------------------------------------------------
+%macro PRED4x4_HD 0
+cglobal pred4x4_horizontal_down_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m0, [r0-8] ; lt ..
+ movhps m0, [r0]
+ pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
+ movq m1, [r1+r2*2-8] ; l3
+ movq m3, [r1+r2*1-8]
+ punpcklwd m1, m3 ; l2 l3
+ movq m2, [r0+r2*2-8] ; l1
+ movq m3, [r0+r2*1-8]
+ punpcklwd m2, m3 ; l0 l1
+ punpckhdq m1, m2 ; l0 l1 l2 l3
+ punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+ psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
+ psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
+ pavgw m5, m1, m3
+ PRED4x4_LOWPASS m3, m1, m0, m3
+ punpcklwd m5, m3
+ psrldq m3, 8
+ PALIGNR m3, m5, 12, m4
+ movq [r1+r2*2], m5
+ movhps [r0+r2*2], m5
+ psrldq m5, 4
+ movq [r1+r2*1], m5
+ movq [r0+r2*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_HD
+INIT_XMM ssse3
+PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_HD
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_dc_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m2, [r0+r2*1-8]
+ paddw m2, [r0+r2*2-8]
+ paddw m2, [r1+r2*1-8]
+ paddw m2, [r1+r2*2-8]
+ psrlq m2, 48
+ movq m0, [r0]
+ HADDW m0, m1
+ paddw m0, [pw_4]
+ paddw m0, m2
+ psrlw m0, 3
+ SPLATW m0, m0, 0
+ movq [r0+r2*1], m0
+ movq [r0+r2*2], m0
+ movq [r1+r2*1], m0
+ movq [r1+r2*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_DL 0
+cglobal pred4x4_down_left_10, 3, 3
+ sub r0, r2
+ movq m0, [r0]
+ movhps m0, [r1]
+ psrldq m2, m0, 2
+ pslldq m3, m0, 2
+ pshufhw m2, m2, 10100100b
+ PRED4x4_LOWPASS m0, m3, m2, m0
+ lea r1, [r0+r2*2]
+ movhps [r1+r2*2], m0
+ psrldq m0, 2
+ movq [r0+r2*1], m0
+ psrldq m0, 2
+ movq [r0+r2*2], m0
+ psrldq m0, 2
+ movq [r1+r2*1], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_DL
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_VL 0
+cglobal pred4x4_vertical_left_10, 3, 3
+ sub r0, r2
+ movu m1, [r0]
+ movhps m1, [r1]
+ psrldq m0, m1, 2
+ psrldq m2, m1, 4
+ pavgw m4, m0, m1
+ PRED4x4_LOWPASS m0, m1, m2, m0
+ lea r1, [r0+r2*2]
+ movq [r0+r2*1], m4
+ movq [r0+r2*2], m0
+ psrldq m4, 2
+ psrldq m0, 2
+ movq [r1+r2*1], m4
+ movq [r1+r2*2], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_VL
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_up_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m0, [r0+r2*1-8]
+ punpckhwd m0, [r0+r2*2-8]
+ movq m1, [r1+r2*1-8]
+ punpckhwd m1, [r1+r2*2-8]
+ punpckhdq m0, m1
+ pshufw m1, m1, 0xFF
+ movq [r1+r2*2], m1
+ movd [r1+r2*1+4], m1
+ pshufw m2, m0, 11111001b
+ movq m1, m2
+ pavgw m2, m0
+
+ pshufw m5, m0, 11111110b
+ PRED4x4_LOWPASS m1, m0, m5, m1
+ movq m6, m2
+ punpcklwd m6, m1
+ movq [r0+r2*1], m6
+ psrlq m2, 16
+ psrlq m1, 16
+ punpcklwd m2, m1
+ movq [r0+r2*2], m2
+ psrlq m2, 32
+ movd [r1+r2*1], m2
+ RET
+
+
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_vertical_10, 2, 2
+ sub r0, r1
+ mova m0, [r0]
+%rep 3
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ lea r0, [r0+r1*2]
+%endrep
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_horizontal_10, 2, 3
+ mov r2d, 4
+.loop:
+ movq m0, [r0+r1*0-8]
+ movq m1, [r0+r1*1-8]
+ pshuflw m0, m0, 0xff
+ pshuflw m1, m1, 0xff
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro MOV8 2-3
+; sort of a hack, but it works
+%if mmsize==8
+ movq [%1+0], %2
+ movq [%1+8], %3
+%else
+ movdqa [%1], %2
+%endif
+%endmacro
+
+%macro PRED8x8_DC 1
+cglobal pred8x8_dc_10, 2, 6
+ sub r0, r1
+ pxor m4, m4
+ movq m0, [r0+0]
+ movq m1, [r0+8]
+%if mmsize==16
+ punpcklwd m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%else
+ pshufw m2, m0, 00001110b
+ pshufw m3, m1, 00001110b
+ paddw m0, m2
+ paddw m1, m3
+ punpcklwd m0, m1
+%endif
+ %1 m2, m0, 00001110b
+ paddw m0, m2
+
+ lea r5, [r1*3]
+ lea r4, [r0+r1*4]
+ movzx r2d, word [r0+r1*1-2]
+ movzx r3d, word [r0+r1*2-2]
+ add r2d, r3d
+ movzx r3d, word [r0+r5*1-2]
+ add r2d, r3d
+ movzx r3d, word [r4-2]
+ add r2d, r3d
+ movd m2, r2d ; s2
+
+ movzx r2d, word [r4+r1*1-2]
+ movzx r3d, word [r4+r1*2-2]
+ add r2d, r3d
+ movzx r3d, word [r4+r5*1-2]
+ add r2d, r3d
+ movzx r3d, word [r4+r1*4-2]
+ add r2d, r3d
+ movd m3, r2d ; s3
+
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; s0, s1, s2, s3
+ %1 m3, m0, 11110110b ; s2, s1, s3, s3
+ %1 m0, m0, 01110100b ; s0, s1, s3, s1
+ paddw m0, m3
+ psrlw m0, 2
+ pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
+%if mmsize==16
+ punpcklwd m0, m0
+ pshufd m3, m0, 11111010b
+ punpckldq m0, m0
+ SWAP 0,1
+%else
+ pshufw m1, m0, 0x00
+ pshufw m2, m0, 0x55
+ pshufw m3, m0, 0xaa
+ pshufw m4, m0, 0xff
+%endif
+ MOV8 r0+r1*1, m1, m2
+ MOV8 r0+r1*2, m1, m2
+ MOV8 r0+r5*1, m1, m2
+ MOV8 r0+r1*4, m1, m2
+ MOV8 r4+r1*1, m3, m4
+ MOV8 r4+r1*2, m3, m4
+ MOV8 r4+r5*1, m3, m4
+ MOV8 r4+r1*4, m3, m4
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8_DC pshufw
+INIT_XMM sse2
+PRED8x8_DC pshuflw
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_top_dc_10, 2, 4
+ sub r0, r1
+ mova m0, [r0]
+ pshuflw m1, m0, 0x4e
+ pshufhw m1, m1, 0x4e
+ paddw m0, m1
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
+ paddw m0, m1
+ lea r2, [r1*3]
+ lea r3, [r0+r1*4]
+ paddw m0, [pw_2]
+ psrlw m0, 2
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ mova [r0+r2*1], m0
+ mova [r0+r1*4], m0
+ mova [r3+r1*1], m0
+ mova [r3+r1*2], m0
+ mova [r3+r2*1], m0
+ mova [r3+r1*4], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_plane_10, 2, 7, 7
+ sub r0, r1
+ lea r2, [r1*3]
+ lea r3, [r0+r1*4]
+ mova m2, [r0]
+ pmaddwd m2, [pw_m32101234]
+ HADDD m2, m1
+ movd m0, [r0-4]
+ psrld m0, 14
+ psubw m2, m0 ; H
+ movd m0, [r3+r1*4-4]
+ movd m1, [r0+12]
+ paddw m0, m1
+ psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
+ movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
+ movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
+ sub r4d, r5d
+ movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
+ movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
+ sub r6d, r5d
+ lea r4d, [r4+r6*2]
+ movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
+ movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
+ sub r5d, r6d
+ lea r5d, [r5*3]
+ add r4d, r5d
+ movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
+ movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
+ sub r6d, r5d
+ lea r4d, [r4+r6*4]
+ movd m3, r4d ; V
+ punpckldq m2, m3
+ pmaddwd m2, [pd_17]
+ paddd m2, [pd_16]
+ psrad m2, 5 ; b, c
+
+ mova m3, [pw_pixel_max]
+ pxor m1, m1
+ SPLATW m0, m0, 1
+ SPLATW m4, m2, 2
+ SPLATW m2, m2, 0
+ pmullw m2, [pw_m32101234] ; b
+ pmullw m5, m4, [pw_m3] ; c
+ paddw m5, [pw_16]
+ mov r2d, 8
+ add r0, r1
+.loop:
+ paddsw m6, m2, m5
+ paddsw m6, m0
+ psraw m6, 5
+ CLIPW m6, m1, m3
+ mova [r0], m6
+ paddw m5, m4
+ add r0, r1
+ dec r2d
+ jg .loop
+ REP_RET
+
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_128_DC 0
+cglobal pred8x8l_128_dc_10, 4, 4
+ mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ MOV8 r0+r3*0, m0, m0
+ MOV8 r0+r3*1, m0, m0
+ MOV8 r0+r3*2, m0, m0
+ MOV8 r0+r1*1, m0, m0
+ MOV8 r2+r3*0, m0, m0
+ MOV8 r2+r3*1, m0, m0
+ MOV8 r2+r3*2, m0, m0
+ MOV8 r2+r1*1, m0, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_128_DC
+INIT_XMM sse2
+PRED8x8L_128_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_TOP_DC 0
+cglobal pred8x8l_top_dc_10, 4, 4, 6
+ sub r0, r3
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ HADDW m0, m1
+ paddw m0, [pw_4]
+ psrlw m0, 3
+ SPLATW m0, m0, 0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r1*1], m0
+ mova [r0+r3*4], m0
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m0
+ mova [r2+r1*1], m0
+ mova [r2+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_TOP_DC
+%endif
+
+;-------------------------------------------------------------------------------
+; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-------------------------------------------------------------------------------
+;TODO: see if scalar is faster
+%macro PRED8x8L_DC 0
+cglobal pred8x8l_dc_10, 4, 6, 6
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r5, [r3*3]
+ mova m0, [r0+r3*2-16]
+ punpckhwd m0, [r0+r3*1-16]
+ mova m1, [r4+r3*0-16]
+ punpckhwd m1, [r0+r5*1-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*2-16]
+ punpckhwd m2, [r4+r3*1-16]
+ mova m3, [r4+r3*4-16]
+ punpckhwd m3, [r4+r5*1-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ not r1
+ and r1, r3
+ pslldq m4, m3, 2
+ psrldq m5, m3, 2
+ pshuflw m4, m4, 11100101b
+ pinsrw m5, [r0+r1-2], 7
+ PRED4x4_LOWPASS m3, m4, m5, m3
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ paddw m0, m3
+ HADDW m0, m1
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ SPLATW m0, m0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r5*1], m0
+ mova [r0+r3*4], m0
+ mova [r4+r3*1], m0
+ mova [r4+r3*2], m0
+ mova [r4+r5*1], m0
+ mova [r4+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DC
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_VERTICAL 0
+cglobal pred8x8l_vertical_10, 4, 4, 6
+ sub r0, r3
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r1*1], m0
+ mova [r0+r3*4], m0
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m0
+ mova [r2+r1*1], m0
+ mova [r2+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_VERTICAL
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_HORIZONTAL 0
+cglobal pred8x8l_horizontal_10, 4, 4, 5
+ mova m0, [r0-16]
+ shr r1d, 14
+ dec r1
+ and r1, r3
+ sub r1, r3
+ punpckhwd m0, [r0+r1-16]
+ mova m1, [r0+r3*2-16]
+ punpckhwd m1, [r0+r3*1-16]
+ lea r2, [r0+r3*4]
+ lea r1, [r3*3]
+ punpckhdq m1, m0
+ mova m2, [r2+r3*0-16]
+ punpckhwd m2, [r0+r1-16]
+ mova m3, [r2+r3*2-16]
+ punpckhwd m3, [r2+r3*1-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ PALIGNR m4, m3, [r2+r1-16], 14, m0
+ pslldq m0, m4, 2
+ pshuflw m0, m0, 11100101b
+ PRED4x4_LOWPASS m4, m3, m0, m4
+ punpckhwd m3, m4, m4
+ punpcklwd m4, m4
+ pshufd m0, m3, 0xff
+ pshufd m1, m3, 0xaa
+ pshufd m2, m3, 0x55
+ pshufd m3, m3, 0x00
+ mova [r0+r3*0], m0
+ mova [r0+r3*1], m1
+ mova [r0+r3*2], m2
+ mova [r0+r1*1], m3
+ pshufd m0, m4, 0xff
+ pshufd m1, m4, 0xaa
+ pshufd m2, m4, 0x55
+ pshufd m3, m4, 0x00
+ mova [r2+r3*0], m0
+ mova [r2+r3*1], m1
+ mova [r2+r3*2], m2
+ mova [r2+r1*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_HORIZONTAL
+INIT_XMM ssse3
+PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_HORIZONTAL
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_DOWN_LEFT 0
+cglobal pred8x8l_down_left_10, 4, 4, 7
+ sub r0, r3
+ mova m3, [r0]
+ shr r1d, 14
+ neg r1
+ shr r2d, 13
+ pslldq m1, m3, 2
+ psrldq m2, m3, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ PRED4x4_LOWPASS m6, m2, m1, m3
+ jz .fix_tr ; flags from shr r2d
+ mova m1, [r0+16]
+ psrldq m5, m1, 2
+ PALIGNR m2, m1, m3, 14, m3
+ pshufhw m5, m5, 10100100b
+ PRED4x4_LOWPASS m1, m2, m5, m1
+.do_topright:
+ lea r1, [r3*3]
+ psrldq m5, m1, 14
+ lea r2, [r0+r3*4]
+ PALIGNR m2, m1, m6, 2, m0
+ PALIGNR m3, m1, m6, 14, m0
+ PALIGNR m5, m1, 2, m0
+ pslldq m4, m6, 2
+ PRED4x4_LOWPASS m6, m4, m2, m6
+ PRED4x4_LOWPASS m1, m3, m5, m1
+ mova [r2+r3*4], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r1*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r3*2], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r3*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*4], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r1*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*2], m1
+ PALIGNR m1, m6, 14, m6
+ mova [r0+r3*1], m1
+ RET
+.fix_tr:
+ punpckhwd m3, m3
+ pshufd m1, m3, 0xFF
+ jmp .do_topright
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DOWN_LEFT
+INIT_XMM ssse3
+PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DOWN_LEFT
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_DOWN_RIGHT 0
+; standard forbids this when has_topleft is false
+; no need to check
+cglobal pred8x8l_down_right_10, 4, 5, 8
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r1, [r3*3]
+ mova m0, [r0+r3*1-16]
+ punpckhwd m0, [r0+r3*0-16]
+ mova m1, [r0+r1*1-16]
+ punpckhwd m1, [r0+r3*2-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*1-16]
+ punpckhwd m2, [r4+r3*0-16]
+ mova m3, [r4+r1*1-16]
+ punpckhwd m3, [r4+r3*2-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r4+r3*4-16]
+ mova m1, [r0]
+ PALIGNR m4, m3, m0, 14, m0
+ PALIGNR m1, m3, 2, m2
+ pslldq m0, m4, 2
+ pshuflw m0, m0, 11100101b
+ PRED4x4_LOWPASS m6, m1, m4, m3
+ PRED4x4_LOWPASS m4, m3, m0, m4
+ mova m3, [r0]
+ shr r2d, 13
+ pslldq m1, m3, 2
+ psrldq m2, m3, 2
+ pinsrw m1, [r0-2], 0
+ pinsrw m2, [r0+r2+14], 7
+ PRED4x4_LOWPASS m3, m2, m1, m3
+ PALIGNR m2, m3, m6, 2, m0
+ PALIGNR m5, m3, m6, 14, m0
+ psrldq m7, m3, 2
+ PRED4x4_LOWPASS m6, m4, m2, m6
+ PRED4x4_LOWPASS m3, m5, m7, m3
+ mova [r4+r3*4], m6
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*2], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r1*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*4], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r4+r3*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r4+r3*2], m3
+ PALIGNR m3, m6, 14, m6
+ mova [r4+r1*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DOWN_RIGHT
+INIT_XMM ssse3
+PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DOWN_RIGHT
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_VERTICAL_RIGHT 0
+; likewise with 8x8l_down_right
+cglobal pred8x8l_vertical_right_10, 4, 5, 7
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r1, [r3*3]
+ mova m0, [r0+r3*1-16]
+ punpckhwd m0, [r0+r3*0-16]
+ mova m1, [r0+r1*1-16]
+ punpckhwd m1, [r0+r3*2-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*1-16]
+ punpckhwd m2, [r4+r3*0-16]
+ mova m3, [r4+r1*1-16]
+ punpckhwd m3, [r4+r3*2-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r4+r3*4-16]
+ mova m1, [r0]
+ PALIGNR m4, m3, m0, 14, m0
+ PALIGNR m1, m3, 2, m2
+ PRED4x4_LOWPASS m3, m1, m4, m3
+ mova m2, [r0]
+ shr r2d, 13
+ pslldq m1, m2, 2
+ psrldq m5, m2, 2
+ pinsrw m1, [r0-2], 0
+ pinsrw m5, [r0+r2+14], 7
+ PRED4x4_LOWPASS m2, m5, m1, m2
+ PALIGNR m6, m2, m3, 12, m1
+ PALIGNR m5, m2, m3, 14, m0
+ PRED4x4_LOWPASS m0, m6, m2, m5
+ pavgw m2, m5
+ mova [r0+r3*2], m0
+ mova [r0+r3*1], m2
+ pslldq m6, m3, 4
+ pslldq m1, m3, 2
+ PRED4x4_LOWPASS m1, m3, m6, m1
+ PALIGNR m2, m1, 14, m4
+ mova [r0+r1*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m3
+ mova [r0+r3*4], m0
+ pslldq m1, 2
+ PALIGNR m2, m1, 14, m4
+ mova [r4+r3*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m3
+ mova [r4+r3*2], m0
+ pslldq m1, 2
+ PALIGNR m2, m1, 14, m4
+ mova [r4+r1*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m1
+ mova [r4+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_VERTICAL_RIGHT
+INIT_XMM ssse3
+PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_VERTICAL_RIGHT
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_HORIZONTAL_UP 0
+cglobal pred8x8l_horizontal_up_10, 4, 4, 6
+ mova m0, [r0+r3*0-16]
+ punpckhwd m0, [r0+r3*1-16]
+ shr r1d, 14
+ dec r1
+ and r1, r3
+ sub r1, r3
+ mova m4, [r0+r1*1-16]
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ mova m1, [r0+r3*2-16]
+ punpckhwd m1, [r0+r1*1-16]
+ punpckhdq m0, m1
+ mova m2, [r2+r3*0-16]
+ punpckhwd m2, [r2+r3*1-16]
+ mova m3, [r2+r3*2-16]
+ punpckhwd m3, [r2+r1*1-16]
+ punpckhdq m2, m3
+ punpckhqdq m0, m2
+ PALIGNR m1, m0, m4, 14, m4
+ psrldq m2, m0, 2
+ pshufhw m2, m2, 10100100b
+ PRED4x4_LOWPASS m0, m1, m2, m0
+ psrldq m1, m0, 2
+ psrldq m2, m0, 4
+ pshufhw m1, m1, 10100100b
+ pshufhw m2, m2, 01010100b
+ pavgw m4, m0, m1
+ PRED4x4_LOWPASS m1, m2, m0, m1
+ punpckhwd m5, m4, m1
+ punpcklwd m4, m1
+ mova [r2+r3*0], m5
+ mova [r0+r3*0], m4
+ pshufd m0, m5, 11111001b
+ pshufd m1, m5, 11111110b
+ pshufd m2, m5, 11111111b
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m1
+ mova [r2+r1*1], m2
+ PALIGNR m2, m5, m4, 4, m0
+ PALIGNR m3, m5, m4, 8, m1
+ PALIGNR m5, m5, m4, 12, m4
+ mova [r0+r3*1], m2
+ mova [r0+r3*2], m3
+ mova [r0+r1*1], m5
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_HORIZONTAL_UP
+INIT_XMM ssse3
+PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_HORIZONTAL_UP
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro MOV16 3-5
+ mova [%1+ 0], %2
+ mova [%1+mmsize], %3
+%if mmsize==8
+ mova [%1+ 16], %4
+ mova [%1+ 24], %5
+%endif
+%endmacro
+
+%macro PRED16x16_VERTICAL 0
+cglobal pred16x16_vertical_10, 2, 3
+ sub r0, r1
+ mov r2d, 8
+ mova m0, [r0+ 0]
+ mova m1, [r0+mmsize]
+%if mmsize==8
+ mova m2, [r0+16]
+ mova m3, [r0+24]
+%endif
+.loop:
+ MOV16 r0+r1*1, m0, m1, m2, m3
+ MOV16 r0+r1*2, m0, m1, m2, m3
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_VERTICAL
+INIT_XMM sse2
+PRED16x16_VERTICAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_HORIZONTAL 0
+cglobal pred16x16_horizontal_10, 2, 3
+ mov r2d, 8
+.vloop:
+ movd m0, [r0+r1*0-4]
+ movd m1, [r0+r1*1-4]
+ SPLATW m0, m0, 1
+ SPLATW m1, m1, 1
+ MOV16 r0+r1*0, m0, m0, m0, m0
+ MOV16 r0+r1*1, m1, m1, m1, m1
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .vloop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_HORIZONTAL
+INIT_XMM sse2
+PRED16x16_HORIZONTAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_DC 0
+cglobal pred16x16_dc_10, 2, 6
+ mov r5, r0
+ sub r0, r1
+ mova m0, [r0+0]
+ paddw m0, [r0+mmsize]
+%if mmsize==8
+ paddw m0, [r0+16]
+ paddw m0, [r0+24]
+%endif
+ HADDW m0, m2
+
+ lea r0, [r0+r1-2]
+ movzx r3d, word [r0]
+ movzx r4d, word [r0+r1]
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx r2d, word [r0]
+ add r3d, r2d
+ movzx r2d, word [r0+r1]
+ add r4d, r2d
+%endrep
+ lea r3d, [r3+r4+16]
+
+ movd m1, r3d
+ paddw m0, m1
+ psrlw m0, 5
+ SPLATW m0, m0
+ mov r3d, 8
+.loop:
+ MOV16 r5+r1*0, m0, m0, m0, m0
+ MOV16 r5+r1*1, m0, m0, m0, m0
+ lea r5, [r5+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_DC
+INIT_XMM sse2
+PRED16x16_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_TOP_DC 0
+cglobal pred16x16_top_dc_10, 2, 3
+ sub r0, r1
+ mova m0, [r0+0]
+ paddw m0, [r0+mmsize]
+%if mmsize==8
+ paddw m0, [r0+16]
+ paddw m0, [r0+24]
+%endif
+ HADDW m0, m2
+
+ SPLATW m0, m0
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ mov r2d, 8
+.loop:
+ MOV16 r0+r1*1, m0, m0, m0, m0
+ MOV16 r0+r1*2, m0, m0, m0, m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_TOP_DC
+INIT_XMM sse2
+PRED16x16_TOP_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_LEFT_DC 0
+cglobal pred16x16_left_dc_10, 2, 6
+ mov r5, r0
+
+ sub r0, 2
+ movzx r3d, word [r0]
+ movzx r4d, word [r0+r1]
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx r2d, word [r0]
+ add r3d, r2d
+ movzx r2d, word [r0+r1]
+ add r4d, r2d
+%endrep
+ lea r3d, [r3+r4+8]
+ shr r3d, 4
+
+ movd m0, r3d
+ SPLATW m0, m0
+ mov r3d, 8
+.loop:
+ MOV16 r5+r1*0, m0, m0, m0, m0
+ MOV16 r5+r1*1, m0, m0, m0, m0
+ lea r5, [r5+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_LEFT_DC
+INIT_XMM sse2
+PRED16x16_LEFT_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_128_DC 0
+cglobal pred16x16_128_dc_10, 2,3
+ mova m0, [pw_512]
+ mov r2d, 8
+.loop:
+ MOV16 r0+r1*0, m0, m0, m0, m0
+ MOV16 r0+r1*1, m0, m0, m0, m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_128_DC
+INIT_XMM sse2
+PRED16x16_128_DC
diff --git a/libs/ffvpx/libavcodec/x86/h264_intrapred_init.c b/libs/ffvpx/libavcodec/x86/h264_intrapred_init.c
new file mode 100644
index 000000000..bdd5125d6
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/h264_intrapred_init.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+#define PRED4x4(TYPE, DEPTH, OPT) \
+void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ const uint8_t *topright, \
+ ptrdiff_t stride);
+
+PRED4x4(dc, 10, mmxext)
+PRED4x4(down_left, 10, sse2)
+PRED4x4(down_left, 10, avx)
+PRED4x4(down_right, 10, sse2)
+PRED4x4(down_right, 10, ssse3)
+PRED4x4(down_right, 10, avx)
+PRED4x4(vertical_left, 10, sse2)
+PRED4x4(vertical_left, 10, avx)
+PRED4x4(vertical_right, 10, sse2)
+PRED4x4(vertical_right, 10, ssse3)
+PRED4x4(vertical_right, 10, avx)
+PRED4x4(horizontal_up, 10, mmxext)
+PRED4x4(horizontal_down, 10, sse2)
+PRED4x4(horizontal_down, 10, ssse3)
+PRED4x4(horizontal_down, 10, avx)
+
+#define PRED8x8(TYPE, DEPTH, OPT) \
+void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ ptrdiff_t stride);
+
+PRED8x8(dc, 10, mmxext)
+PRED8x8(dc, 10, sse2)
+PRED8x8(top_dc, 10, sse2)
+PRED8x8(plane, 10, sse2)
+PRED8x8(vertical, 10, sse2)
+PRED8x8(horizontal, 10, sse2)
+
+#define PRED8x8L(TYPE, DEPTH, OPT)\
+void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ int has_topleft, \
+ int has_topright, \
+ ptrdiff_t stride);
+
+PRED8x8L(dc, 10, sse2)
+PRED8x8L(dc, 10, avx)
+PRED8x8L(128_dc, 10, mmxext)
+PRED8x8L(128_dc, 10, sse2)
+PRED8x8L(top_dc, 10, sse2)
+PRED8x8L(top_dc, 10, avx)
+PRED8x8L(vertical, 10, sse2)
+PRED8x8L(vertical, 10, avx)
+PRED8x8L(horizontal, 10, sse2)
+PRED8x8L(horizontal, 10, ssse3)
+PRED8x8L(horizontal, 10, avx)
+PRED8x8L(down_left, 10, sse2)
+PRED8x8L(down_left, 10, ssse3)
+PRED8x8L(down_left, 10, avx)
+PRED8x8L(down_right, 10, sse2)
+PRED8x8L(down_right, 10, ssse3)
+PRED8x8L(down_right, 10, avx)
+PRED8x8L(vertical_right, 10, sse2)
+PRED8x8L(vertical_right, 10, ssse3)
+PRED8x8L(vertical_right, 10, avx)
+PRED8x8L(horizontal_up, 10, sse2)
+PRED8x8L(horizontal_up, 10, ssse3)
+PRED8x8L(horizontal_up, 10, avx)
+
+#define PRED16x16(TYPE, DEPTH, OPT)\
+void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ ptrdiff_t stride);
+
+PRED16x16(dc, 10, mmxext)
+PRED16x16(dc, 10, sse2)
+PRED16x16(top_dc, 10, mmxext)
+PRED16x16(top_dc, 10, sse2)
+PRED16x16(128_dc, 10, mmxext)
+PRED16x16(128_dc, 10, sse2)
+PRED16x16(left_dc, 10, mmxext)
+PRED16x16(left_dc, 10, sse2)
+PRED16x16(vertical, 10, mmxext)
+PRED16x16(vertical, 10, sse2)
+PRED16x16(horizontal, 10, mmxext)
+PRED16x16(horizontal, 10, sse2)
+
+/* 8-bit versions */
+PRED16x16(vertical, 8, mmx)
+PRED16x16(vertical, 8, sse)
+PRED16x16(horizontal, 8, mmx)
+PRED16x16(horizontal, 8, mmxext)
+PRED16x16(horizontal, 8, ssse3)
+PRED16x16(dc, 8, mmxext)
+PRED16x16(dc, 8, sse2)
+PRED16x16(dc, 8, ssse3)
+PRED16x16(plane_h264, 8, mmx)
+PRED16x16(plane_h264, 8, mmxext)
+PRED16x16(plane_h264, 8, sse2)
+PRED16x16(plane_h264, 8, ssse3)
+PRED16x16(plane_rv40, 8, mmx)
+PRED16x16(plane_rv40, 8, mmxext)
+PRED16x16(plane_rv40, 8, sse2)
+PRED16x16(plane_rv40, 8, ssse3)
+PRED16x16(plane_svq3, 8, mmx)
+PRED16x16(plane_svq3, 8, mmxext)
+PRED16x16(plane_svq3, 8, sse2)
+PRED16x16(plane_svq3, 8, ssse3)
+PRED16x16(tm_vp8, 8, mmx)
+PRED16x16(tm_vp8, 8, mmxext)
+PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
+
+PRED8x8(top_dc, 8, mmxext)
+PRED8x8(dc_rv40, 8, mmxext)
+PRED8x8(dc, 8, mmxext)
+PRED8x8(vertical, 8, mmx)
+PRED8x8(horizontal, 8, mmx)
+PRED8x8(horizontal, 8, mmxext)
+PRED8x8(horizontal, 8, ssse3)
+PRED8x8(plane, 8, mmx)
+PRED8x8(plane, 8, mmxext)
+PRED8x8(plane, 8, sse2)
+PRED8x8(plane, 8, ssse3)
+PRED8x8(tm_vp8, 8, mmx)
+PRED8x8(tm_vp8, 8, mmxext)
+PRED8x8(tm_vp8, 8, sse2)
+PRED8x8(tm_vp8, 8, ssse3)
+
+PRED8x8L(top_dc, 8, mmxext)
+PRED8x8L(top_dc, 8, ssse3)
+PRED8x8L(dc, 8, mmxext)
+PRED8x8L(dc, 8, ssse3)
+PRED8x8L(horizontal, 8, mmxext)
+PRED8x8L(horizontal, 8, ssse3)
+PRED8x8L(vertical, 8, mmxext)
+PRED8x8L(vertical, 8, ssse3)
+PRED8x8L(down_left, 8, mmxext)
+PRED8x8L(down_left, 8, sse2)
+PRED8x8L(down_left, 8, ssse3)
+PRED8x8L(down_right, 8, mmxext)
+PRED8x8L(down_right, 8, sse2)
+PRED8x8L(down_right, 8, ssse3)
+PRED8x8L(vertical_right, 8, mmxext)
+PRED8x8L(vertical_right, 8, sse2)
+PRED8x8L(vertical_right, 8, ssse3)
+PRED8x8L(vertical_left, 8, sse2)
+PRED8x8L(vertical_left, 8, ssse3)
+PRED8x8L(horizontal_up, 8, mmxext)
+PRED8x8L(horizontal_up, 8, ssse3)
+PRED8x8L(horizontal_down, 8, mmxext)
+PRED8x8L(horizontal_down, 8, sse2)
+PRED8x8L(horizontal_down, 8, ssse3)
+
+PRED4x4(dc, 8, mmxext)
+PRED4x4(down_left, 8, mmxext)
+PRED4x4(down_right, 8, mmxext)
+PRED4x4(vertical_left, 8, mmxext)
+PRED4x4(vertical_right, 8, mmxext)
+PRED4x4(horizontal_up, 8, mmxext)
+PRED4x4(horizontal_down, 8, mmxext)
+PRED4x4(tm_vp8, 8, mmx)
+PRED4x4(tm_vp8, 8, mmxext)
+PRED4x4(tm_vp8, 8, ssse3)
+PRED4x4(vertical_vp8, 8, mmxext)
+
+av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (bit_depth == 8) {
+ if (EXTERNAL_MMX(cpu_flags)) {
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
+ if (chroma_format_idc <= 1) {
+ h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
+ }
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
+ } else {
+ if (chroma_format_idc <= 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ if (cpu_flags & AV_CPU_FLAG_CMOV)
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
+ }
+ }
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
+ if (chroma_format_idc <= 1)
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
+ h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
+ h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
+ h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
+ h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
+ h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
+ h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
+ h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
+ h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
+ h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
+ codec_id == AV_CODEC_ID_H264) {
+ h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+ h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
+ }
+ if (codec_id != AV_CODEC_ID_RV40) {
+ h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+ if (chroma_format_idc <= 1) {
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
+ }
+ }
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
+ h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
+ h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
+ } else {
+ if (chroma_format_idc <= 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
+ } else {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
+ }
+ }
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
+ h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
+ } else {
+ if (chroma_format_idc <= 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
+ }
+ }
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
+ if (chroma_format_idc <= 1)
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
+ h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
+ h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
+ h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
+ h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
+ h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
+ h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
+ } else {
+ if (chroma_format_idc <= 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
+ }
+ }
+ }
+
+ if(EXTERNAL_AVX2(cpu_flags)){
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
+ }
+ }
+ } else if (bit_depth == 10) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
+ h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
+
+ if (chroma_format_idc <= 1)
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
+
+ h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
+ h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
+ h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
+
+ if (chroma_format_idc <= 1) {
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
+ h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
+ h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
+ }
+
+ h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
+ h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
+ h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
+ h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
+ h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
+ }
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
+
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
+ h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
+
+ h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
+ h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
+ h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
+ }
+ }
+}
diff --git a/libs/ffvpx/libavcodec/x86/mathops.h b/libs/ffvpx/libavcodec/x86/mathops.h
new file mode 100644
index 000000000..6298f5ed1
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/mathops.h
@@ -0,0 +1,133 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_MATHOPS_H
+#define AVCODEC_X86_MATHOPS_H
+
+#include "config.h"
+
+#include "libavutil/common.h"
+#include "libavutil/x86/asm.h"
+
+#if HAVE_INLINE_ASM
+
+#if ARCH_X86_32
+
+#define MULL MULL
+static av_always_inline av_const int MULL(int a, int b, unsigned shift)
+{
+ int rt, dummy;
+ __asm__ (
+ "imull %3 \n\t"
+ "shrdl %4, %%edx, %%eax \n\t"
+ :"=a"(rt), "=d"(dummy)
+ :"a"(a), "rm"(b), "ci"((uint8_t)shift)
+ );
+ return rt;
+}
+
+#define MULH MULH
+static av_always_inline av_const int MULH(int a, int b)
+{
+ int rt, dummy;
+ __asm__ (
+ "imull %3"
+ :"=d"(rt), "=a"(dummy)
+ :"a"(a), "rm"(b)
+ );
+ return rt;
+}
+
+#define MUL64 MUL64
+static av_always_inline av_const int64_t MUL64(int a, int b)
+{
+ int64_t rt;
+ __asm__ (
+ "imull %2"
+ :"=A"(rt)
+ :"a"(a), "rm"(b)
+ );
+ return rt;
+}
+
+#endif /* ARCH_X86_32 */
+
+#if HAVE_I686
+/* median of 3 */
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+ int i=b;
+ __asm__ (
+ "cmp %2, %1 \n\t"
+ "cmovg %1, %0 \n\t"
+ "cmovg %2, %1 \n\t"
+ "cmp %3, %1 \n\t"
+ "cmovl %3, %1 \n\t"
+ "cmp %1, %0 \n\t"
+ "cmovg %1, %0 \n\t"
+ :"+&r"(i), "+&r"(a)
+ :"r"(b), "r"(c)
+ );
+ return i;
+}
+
+#if HAVE_6REGS
+#define COPY3_IF_LT(x, y, a, b, c, d)\
+__asm__ volatile(\
+ "cmpl %0, %3 \n\t"\
+ "cmovl %3, %0 \n\t"\
+ "cmovl %4, %1 \n\t"\
+ "cmovl %5, %2 \n\t"\
+ : "+&r" (x), "+&r" (a), "+r" (c)\
+ : "r" (y), "r" (b), "r" (d)\
+);
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_I686 */
+
+#define MASK_ABS(mask, level) \
+ __asm__ ("cdq \n\t" \
+ "xorl %1, %0 \n\t" \
+ "subl %1, %0 \n\t" \
+ : "+a"(level), "=&d"(mask))
+
+// avoid +32 for shift optimization (gcc should do that ...)
+#define NEG_SSR32 NEG_SSR32
+static inline int32_t NEG_SSR32( int32_t a, int8_t s){
+ __asm__ ("sarl %1, %0\n\t"
+ : "+r" (a)
+ : "ic" ((uint8_t)(-s))
+ );
+ return a;
+}
+
+#define NEG_USR32 NEG_USR32
+static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
+ __asm__ ("shrl %1, %0\n\t"
+ : "+r" (a)
+ : "ic" ((uint8_t)(-s))
+ );
+ return a;
+}
+
+#endif /* HAVE_INLINE_ASM */
+#endif /* AVCODEC_X86_MATHOPS_H */
diff --git a/libs/ffvpx/libavcodec/x86/moz.build b/libs/ffvpx/libavcodec/x86/moz.build
new file mode 100644
index 000000000..847d40808
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/moz.build
@@ -0,0 +1,34 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+SOURCES += [
+ 'constants.c',
+ 'flacdsp.asm',
+ 'flacdsp_init.c',
+ 'h264_intrapred.asm',
+ 'h264_intrapred_10bit.asm',
+ 'h264_intrapred_init.c',
+ 'videodsp.asm',
+ 'videodsp_init.c',
+ 'vp8dsp.asm',
+ 'vp8dsp_init.c',
+ 'vp8dsp_loopfilter.asm',
+ 'vp9dsp_init.c',
+ 'vp9dsp_init_10bpp.c',
+ 'vp9dsp_init_12bpp.c',
+ 'vp9dsp_init_16bpp.c',
+ 'vp9intrapred.asm',
+ 'vp9intrapred_16bpp.asm',
+ 'vp9itxfm.asm',
+ 'vp9itxfm_16bpp.asm',
+ 'vp9lpf.asm',
+ 'vp9lpf_16bpp.asm',
+ 'vp9mc.asm',
+ 'vp9mc_16bpp.asm',
+]
+
+FINAL_LIBRARY = 'mozavcodec'
+
+include('/libs/ffvpx/ffvpxcommon.mozbuild')
diff --git a/libs/ffvpx/libavcodec/x86/videodsp.asm b/libs/ffvpx/libavcodec/x86/videodsp.asm
new file mode 100644
index 000000000..e23786070
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/videodsp.asm
@@ -0,0 +1,468 @@
+;******************************************************************************
+;* Core video DSP functions
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; slow vertical extension loop function. Works with variable-width, and
+; does per-line reading/writing of source data
+
+%macro V_COPY_ROW 2 ; type (top/body/bottom), h
+.%1_y_loop: ; do {
+ mov wq, r7mp ; initialize w (r7mp = wmp)
+.%1_x_loop: ; do {
+ movu m0, [srcq+wq] ; m0 = read($mmsize)
+ movu [dstq+wq], m0 ; write(m0, $mmsize)
+ add wq, mmsize ; w -= $mmsize
+ cmp wq, -mmsize ; } while (w > $mmsize);
+ jl .%1_x_loop
+ movu m0, [srcq-mmsize] ; m0 = read($mmsize)
+ movu [dstq-mmsize], m0 ; write(m0, $mmsize)
+%ifidn %1, body ; if ($type == body) {
+ add srcq, src_strideq ; src += src_stride
+%endif ; }
+ add dstq, dst_strideq ; dst += dst_stride
+ dec %2 ; } while (--$h);
+ jnz .%1_y_loop
+%endmacro
+
+%macro vvar_fn 0
+; .----. <- zero
+; | | <- top is copied from first line in body of source
+; |----| <- start_y
+; | | <- body is copied verbatim (line-by-line) from source
+; |----| <- end_y
+; | | <- bottom is copied from last line in body of source
+; '----' <- bh
+%if ARCH_X86_64
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
+ start_y, end_y, bh, w
+%else ; x86-32
+cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
+%define src_strideq r3mp
+%define dst_strideq r1mp
+ mov srcq, r2mp
+ mov start_yq, r4mp
+ mov end_yq, r5mp
+ mov bhq, r6mp
+%endif
+ sub bhq, end_yq ; bh -= end_q
+ sub end_yq, start_yq ; end_q -= start_q
+ add srcq, r7mp ; (r7mp = wmp)
+ add dstq, r7mp ; (r7mp = wmp)
+ neg r7mp ; (r7mp = wmp)
+ test start_yq, start_yq ; if (start_q) {
+ jz .body
+ V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq)
+.body: ; }
+ V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq)
+ test bhq, bhq ; if (bh) {
+ jz .end
+ sub srcq, src_strideq ; src -= src_stride
+ V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh)
+.end: ; }
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+vvar_fn
+%endif
+
+INIT_XMM sse
+vvar_fn
+
+%macro hvar_fn 0
+cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
+ lea dstq, [dstq+n_wordsq*2]
+ neg n_wordsq
+ lea start_xq, [start_xq+n_wordsq*2]
+.y_loop: ; do {
+%if cpuflag(avx2)
+ vpbroadcastb m0, [dstq+start_xq]
+ mov wq, n_wordsq ; initialize w
+%else
+ movzx wd, byte [dstq+start_xq] ; w = read(1)
+ imul wd, 0x01010101 ; w *= 0x01010101
+ movd m0, wd
+ mov wq, n_wordsq ; initialize w
+%if cpuflag(sse2)
+ pshufd m0, m0, q0000 ; splat
+%else ; mmx
+ punpckldq m0, m0 ; splat
+%endif ; mmx/sse
+%endif ; avx2
+.x_loop: ; do {
+ movu [dstq+wq*2], m0 ; write($reg, $mmsize)
+ add wq, mmsize/2 ; w -= $mmsize/2
+ cmp wq, -(mmsize/2) ; } while (w > $mmsize/2)
+ jl .x_loop
+ movu [dstq-mmsize], m0 ; write($reg, $mmsize)
+ add dstq, dst_strideq ; dst += dst_stride
+ dec hq ; } while (h--)
+ jnz .y_loop
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+hvar_fn
+%endif
+
+INIT_XMM sse2
+hvar_fn
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
+; macro to read/write a horizontal number of pixels (%2) to/from registers
+; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
+; - if (%2 & 8) fills 8 bytes into xmm$next
+; - if (%2 & 4) fills 4 bytes into xmm$next
+; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
+; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
+; - if (%2 & 4) fills 4 bytes into mm$next
+; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
+; writing data out is in the same way
+%macro READ_NUM_BYTES 2
+%assign %%off 0 ; offset in source buffer
+%assign %%mmx_idx 0 ; mmx register index
+%assign %%xmm_idx 0 ; xmm register index
+
+%rep %2/mmsize
+%if mmsize == 16
+ movu xmm %+ %%xmm_idx, [srcq+%%off]
+%assign %%xmm_idx %%xmm_idx+1
+%else ; mmx
+ movu mm %+ %%mmx_idx, [srcq+%%off]
+%assign %%mmx_idx %%mmx_idx+1
+%endif
+%assign %%off %%off+mmsize
+%endrep ; %2/mmsize
+
+%if mmsize == 16
+%if (%2-%%off) >= 8
+%if %2 > 16 && (%2-%%off) > 8
+ movu xmm %+ %%xmm_idx, [srcq+%2-16]
+%assign %%xmm_idx %%xmm_idx+1
+%assign %%off %2
+%else
+ movq mm %+ %%mmx_idx, [srcq+%%off]
+%assign %%mmx_idx %%mmx_idx+1
+%assign %%off %%off+8
+%endif
+%endif ; (%2-%%off) >= 8
+%endif
+
+%if (%2-%%off) >= 4
+%if %2 > 8 && (%2-%%off) > 4
+ movq mm %+ %%mmx_idx, [srcq+%2-8]
+%assign %%off %2
+%else
+ movd mm %+ %%mmx_idx, [srcq+%%off]
+%assign %%off %%off+4
+%endif
+%assign %%mmx_idx %%mmx_idx+1
+%endif ; (%2-%%off) >= 4
+
+%if (%2-%%off) >= 1
+%if %2 >= 4
+ movd mm %+ %%mmx_idx, [srcq+%2-4]
+%elif (%2-%%off) == 1
+ mov valb, [srcq+%2-1]
+%elif (%2-%%off) == 2
+ mov valw, [srcq+%2-2]
+%else
+ mov valb, [srcq+%2-1]
+ ror vald, 16
+ mov valw, [srcq+%2-3]
+%endif
+%endif ; (%2-%%off) >= 1
+%endmacro ; READ_NUM_BYTES
+
+%macro WRITE_NUM_BYTES 2
+%assign %%off 0 ; offset in destination buffer
+%assign %%mmx_idx 0 ; mmx register index
+%assign %%xmm_idx 0 ; xmm register index
+
+%rep %2/mmsize
+%if mmsize == 16
+ movu [dstq+%%off], xmm %+ %%xmm_idx
+%assign %%xmm_idx %%xmm_idx+1
+%else ; mmx
+ movu [dstq+%%off], mm %+ %%mmx_idx
+%assign %%mmx_idx %%mmx_idx+1
+%endif
+%assign %%off %%off+mmsize
+%endrep ; %2/mmsize
+
+%if mmsize == 16
+%if (%2-%%off) >= 8
+%if %2 > 16 && (%2-%%off) > 8
+ movu [dstq+%2-16], xmm %+ %%xmm_idx
+%assign %%xmm_idx %%xmm_idx+1
+%assign %%off %2
+%else
+ movq [dstq+%%off], mm %+ %%mmx_idx
+%assign %%mmx_idx %%mmx_idx+1
+%assign %%off %%off+8
+%endif
+%endif ; (%2-%%off) >= 8
+%endif
+
+%if (%2-%%off) >= 4
+%if %2 > 8 && (%2-%%off) > 4
+ movq [dstq+%2-8], mm %+ %%mmx_idx
+%assign %%off %2
+%else
+ movd [dstq+%%off], mm %+ %%mmx_idx
+%assign %%off %%off+4
+%endif
+%assign %%mmx_idx %%mmx_idx+1
+%endif ; (%2-%%off) >= 4
+
+%if (%2-%%off) >= 1
+%if %2 >= 4
+ movd [dstq+%2-4], mm %+ %%mmx_idx
+%elif (%2-%%off) == 1
+ mov [dstq+%2-1], valb
+%elif (%2-%%off) == 2
+ mov [dstq+%2-2], valw
+%else
+ mov [dstq+%2-3], valw
+ ror vald, 16
+ mov [dstq+%2-1], valb
+%ifnidn %1, body
+ ror vald, 16
+%endif
+%endif
+%endif ; (%2-%%off) >= 1
+%endmacro ; WRITE_NUM_BYTES
+
+; vertical top/bottom extend and body copy fast loops
+; these are function pointers to set-width line copy functions, i.e.
+; they read a fixed number of pixels into set registers, and write
+; those out into the destination buffer
+%macro VERTICAL_EXTEND 2
+%assign %%n %1
+%rep 1+%2-%1
+%if %%n <= 3
+%if ARCH_X86_64
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
+ start_y, end_y, val, bh
+ mov bhq, r6mp ; r6mp = bhmp
+%else ; x86-32
+cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
+ mov dstq, r0mp
+ mov srcq, r2mp
+ mov start_yq, r4mp
+ mov end_yq, r5mp
+ mov bhq, r6mp
+%define dst_strideq r1mp
+%define src_strideq r3mp
+%endif ; x86-64/32
+%else
+%if ARCH_X86_64
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
+ start_y, end_y, bh
+%else ; x86-32
+cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
+ mov srcq, r2mp
+ mov start_yq, r4mp
+ mov end_yq, r5mp
+ mov bhq, r6mp
+%define dst_strideq r1mp
+%define src_strideq r3mp
+%endif ; x86-64/32
+%endif
+ ; FIXME move this to c wrapper?
+ sub bhq, end_yq ; bh -= end_y
+ sub end_yq, start_yq ; end_y -= start_y
+
+ ; extend pixels above body
+ test start_yq, start_yq ; if (start_y) {
+ jz .body_loop
+ READ_NUM_BYTES top, %%n ; $variable_regs = read($n)
+.top_loop: ; do {
+ WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n)
+ add dstq, dst_strideq ; dst += linesize
+ dec start_yq ; } while (--start_y)
+ jnz .top_loop ; }
+
+ ; copy body pixels
+.body_loop: ; do {
+ READ_NUM_BYTES body, %%n ; $variable_regs = read($n)
+ WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n)
+ add dstq, dst_strideq ; dst += dst_stride
+ add srcq, src_strideq ; src += src_stride
+ dec end_yq ; } while (--end_y)
+ jnz .body_loop
+
+ ; copy bottom pixels
+ test bhq, bhq ; if (block_h) {
+ jz .end
+ sub srcq, src_strideq ; src -= linesize
+ READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n)
+.bottom_loop: ; do {
+ WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n)
+ add dstq, dst_strideq ; dst += linesize
+ dec bhq ; } while (--bh)
+ jnz .bottom_loop ; }
+
+.end:
+ RET
+%assign %%n %%n+1
+%endrep ; 1+%2-%1
+%endmacro ; VERTICAL_EXTEND
+
+INIT_MMX mmx
+VERTICAL_EXTEND 1, 15
+%if ARCH_X86_32
+VERTICAL_EXTEND 16, 22
+%endif
+
+INIT_XMM sse
+VERTICAL_EXTEND 16, 22
+
+; left/right (horizontal) fast extend functions
+; these are essentially identical to the vertical extend ones above,
+; just left/right separated because number of pixels to extend is
+; obviously not the same on both sides.
+
+%macro READ_V_PIXEL 2
+%if cpuflag(avx2)
+ vpbroadcastb m0, %2
+%else
+ movzx vald, byte %2
+ imul vald, 0x01010101
+%if %1 >= 8
+ movd m0, vald
+%if mmsize == 16
+ pshufd m0, m0, q0000
+%else
+ punpckldq m0, m0
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
+%endmacro ; READ_V_PIXEL
+
+%macro WRITE_V_PIXEL 2
+%assign %%off 0
+
+%if %1 >= 8
+
+%rep %1/mmsize
+ movu [%2+%%off], m0
+%assign %%off %%off+mmsize
+%endrep ; %1/mmsize
+
+%if mmsize == 16
+%if %1-%%off >= 8
+%if %1 > 16 && %1-%%off > 8
+ movu [%2+%1-16], m0
+%assign %%off %1
+%else
+ movq [%2+%%off], m0
+%assign %%off %%off+8
+%endif
+%endif ; %1-%%off >= 8
+%endif ; mmsize == 16
+
+%if %1-%%off >= 4
+%if %1 > 8 && %1-%%off > 4
+ movq [%2+%1-8], m0
+%assign %%off %1
+%else
+ movd [%2+%%off], m0
+%assign %%off %%off+4
+%endif
+%endif ; %1-%%off >= 4
+
+%else ; %1 < 8
+
+%rep %1/4
+ mov [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+ movd [%2+%%off-2], m0
+%else
+ mov [%2+%%off], valw
+%endif ; avx2
+%endif ; (%1-%%off)/2
+%endmacro ; WRITE_V_PIXEL
+
+%macro H_EXTEND 2
+%assign %%n %1
+%rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
+cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
+.loop_y: ; do {
+ READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
+ WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
+ add dstq, dst_strideq ; dst += dst_stride
+ dec bhq ; } while (--bh)
+ jnz .loop_y
+ RET
+%assign %%n %%n+2
+%endrep ; 1+(%2-%1)/2
+%endmacro ; H_EXTEND
+
+INIT_MMX mmx
+H_EXTEND 2, 14
+%if ARCH_X86_32
+H_EXTEND 16, 22
+%endif
+
+INIT_XMM sse2
+H_EXTEND 16, 22
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
+%macro PREFETCH_FN 1
+cglobal prefetch, 3, 3, 0, buf, stride, h
+.loop:
+ %1 [bufq]
+ add bufq, strideq
+ dec hd
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PREFETCH_FN prefetcht0
+%if ARCH_X86_32
+INIT_MMX 3dnow
+PREFETCH_FN prefetch
+%endif
diff --git a/libs/ffvpx/libavcodec/x86/videodsp_init.c b/libs/ffvpx/libavcodec/x86/videodsp_init.c
new file mode 100644
index 000000000..eeebb4154
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/videodsp_init.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) 2002-2012 Michael Niedermayer
+ * Copyright (C) 2012 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/videodsp.h"
+
+#if HAVE_X86ASM
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
+ x86_reg start_y, x86_reg end_y, x86_reg bh);
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
+ x86_reg start_y, x86_reg end_y, x86_reg bh,
+ x86_reg w);
+
+extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
+#if ARCH_X86_32
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
+ &ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
+ &ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
+ &ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
+ &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
+ &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
+ &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
+ &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
+ &ff_emu_edge_vfix22_mmx
+};
+#endif
+extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
+extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
+extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
+ ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
+ ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
+ ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
+ ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
+ ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
+ ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
+ ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
+ ff_emu_edge_vfix22_sse
+};
+extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
+
+typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
+ x86_reg start_x, x86_reg bh);
+typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
+ x86_reg start_x, x86_reg n_words, x86_reg bh);
+
+extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
+#if ARCH_X86_32
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
+ ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
+ ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
+ ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
+};
+#endif
+extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
+ ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
+ ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
+ ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
+ ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+ ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+ ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
+
+static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride,
+ ptrdiff_t src_stride,
+ x86_reg block_w, x86_reg block_h,
+ x86_reg src_x, x86_reg src_y,
+ x86_reg w, x86_reg h,
+ emu_edge_vfix_func * const *vfix_tbl,
+ emu_edge_vvar_func *v_extend_var,
+ emu_edge_hfix_func * const *hfix_tbl,
+ emu_edge_hvar_func *h_extend_var)
+{
+ x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
+
+ if (!w || !h)
+ return;
+
+ av_assert2(block_w <= FFABS(dst_stride));
+
+ if (src_y >= h) {
+ src -= src_y*src_stride;
+ src_y_add = h - 1;
+ src_y = h - 1;
+ } else if (src_y <= -block_h) {
+ src -= src_y*src_stride;
+ src_y_add = 1 - block_h;
+ src_y = 1 - block_h;
+ }
+ if (src_x >= w) {
+ src += w - 1 - src_x;
+ src_x = w - 1;
+ } else if (src_x <= -block_w) {
+ src += 1 - block_w - src_x;
+ src_x = 1 - block_w;
+ }
+
+ start_y = FFMAX(0, -src_y);
+ start_x = FFMAX(0, -src_x);
+ end_y = FFMIN(block_h, h-src_y);
+ end_x = FFMIN(block_w, w-src_x);
+ av_assert2(start_x < end_x && block_w > 0);
+ av_assert2(start_y < end_y && block_h > 0);
+
+ // fill in the to-be-copied part plus all above/below
+ src += (src_y_add + start_y) * src_stride + start_x;
+ w = end_x - start_x;
+ if (w <= 22) {
+ vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
+ start_y, end_y, block_h);
+ } else {
+ v_extend_var(dst + start_x, dst_stride, src, src_stride,
+ start_y, end_y, block_h, w);
+ }
+
+ // fill left
+ if (start_x) {
+ if (start_x <= 22) {
+ hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
+ } else {
+ h_extend_var(dst, dst_stride,
+ start_x, (start_x + 1) >> 1, block_h);
+ }
+ }
+
+ // fill right
+ p = block_w - end_x;
+ if (p) {
+ if (p <= 22) {
+ hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
+ -!(p & 1), block_h);
+ } else {
+ h_extend_var(dst + end_x - (p & 1), dst_stride,
+ -!(p & 1), (p + 1) >> 1, block_h);
+ }
+ }
+}
+
+#if ARCH_X86_32
+static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
+ hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
+}
+
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
+}
+#endif
+
+static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w,
+ int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
+}
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t buf_stride,
+ ptrdiff_t src_stride,
+ int block_w, int block_h,
+ int src_x, int src_y, int w,
+ int h)
+{
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
+#endif /* HAVE_X86ASM */
+
+void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
+void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
+
+av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_mmx;
+ }
+ if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+ ctx->prefetch = ff_prefetch_3dnow;
+ }
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ ctx->prefetch = ff_prefetch_mmxext;
+ }
+#if ARCH_X86_32
+ if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_sse;
+ }
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_sse2;
+ }
+#if HAVE_AVX2_EXTERNAL
+ if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+ ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+ }
+#endif
+#endif /* HAVE_X86ASM */
+}
diff --git a/libs/ffvpx/libavcodec/x86/vp56_arith.h b/libs/ffvpx/libavcodec/x86/vp56_arith.h
new file mode 100644
index 000000000..810cc8dcd
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp56_arith.h
@@ -0,0 +1,51 @@
+/**
+ * VP5 and VP6 compatible video decoder (arith decoder)
+ *
+ * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2010 Eli Friedman
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP56_ARITH_H
+#define AVCODEC_X86_VP56_ARITH_H
+
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
+#define vp56_rac_get_prob vp56_rac_get_prob
+static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
+{
+ unsigned int code_word = vp56_rac_renorm(c);
+ unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
+ unsigned int low_shift = low << 16;
+ int bit = 0;
+ c->code_word = code_word;
+
+ __asm__(
+ "subl %4, %1 \n\t"
+ "subl %3, %2 \n\t"
+ "setae %b0 \n\t"
+ "cmovb %4, %1 \n\t"
+ "cmovb %5, %2 \n\t"
+ : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+ : "r"(low_shift), "r"(low), "r"(code_word)
+ );
+
+ return bit;
+}
+#endif
+
+#endif /* AVCODEC_X86_VP56_ARITH_H */
diff --git a/libs/ffvpx/libavcodec/x86/vp8dsp.asm b/libs/ffvpx/libavcodec/x86/vp8dsp.asm
new file mode 100644
index 000000000..75de5690a
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp8dsp.asm
@@ -0,0 +1,1231 @@
+;******************************************************************************
+;* VP8 MMXEXT optimizations
+;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+fourtap_filter_hw_m: times 4 dw -6, 123
+ times 4 dw 12, -1
+ times 4 dw -9, 93
+ times 4 dw 50, -6
+ times 4 dw -6, 50
+ times 4 dw 93, -9
+ times 4 dw -1, 12
+ times 4 dw 123, -6
+
+sixtap_filter_hw_m: times 4 dw 2, -11
+ times 4 dw 108, 36
+ times 4 dw -8, 1
+ times 4 dw 3, -16
+ times 4 dw 77, 77
+ times 4 dw -16, 3
+ times 4 dw 1, -8
+ times 4 dw 36, 108
+ times 4 dw -11, 2
+
+fourtap_filter_hb_m: times 8 db -6, 123
+ times 8 db 12, -1
+ times 8 db -9, 93
+ times 8 db 50, -6
+ times 8 db -6, 50
+ times 8 db 93, -9
+ times 8 db -1, 12
+ times 8 db 123, -6
+
+sixtap_filter_hb_m: times 8 db 2, 1
+ times 8 db -11, 108
+ times 8 db 36, -8
+ times 8 db 3, 3
+ times 8 db -16, 77
+ times 8 db 77, -16
+ times 8 db 1, 2
+ times 8 db -8, 36
+ times 8 db 108, -11
+
+fourtap_filter_v_m: times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+
+sixtap_filter_v_m: times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+bilinear_filter_vw_m: times 8 dw 1
+ times 8 dw 2
+ times 8 dw 3
+ times 8 dw 4
+ times 8 dw 5
+ times 8 dw 6
+ times 8 dw 7
+
+bilinear_filter_vb_m: times 8 db 7, 1
+ times 8 db 6, 2
+ times 8 db 5, 3
+ times 8 db 4, 4
+ times 8 db 3, 5
+ times 8 db 2, 6
+ times 8 db 1, 7
+
+%ifdef PIC
+%define fourtap_filter_hw picregq
+%define sixtap_filter_hw picregq
+%define fourtap_filter_hb picregq
+%define sixtap_filter_hb picregq
+%define fourtap_filter_v picregq
+%define sixtap_filter_v picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
+%else
+%define fourtap_filter_hw fourtap_filter_hw_m
+%define sixtap_filter_hw sixtap_filter_hw_m
+%define fourtap_filter_hb fourtap_filter_hb_m
+%define sixtap_filter_hb sixtap_filter_hb_m
+%define fourtap_filter_v fourtap_filter_v_m
+%define sixtap_filter_v sixtap_filter_v_m
+%define bilinear_filter_vw bilinear_filter_vw_m
+%define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
+%endif
+
+filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+
+filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+
+pw_20091: times 4 dw 20091
+pw_17734: times 4 dw 17734
+
+cextern pw_3
+cextern pw_4
+cextern pw_64
+cextern pw_256
+
+SECTION .text
+
+;-------------------------------------------------------------------------------
+; subpel MC functions:
+;
+; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
+; uint8_t *src, ptrdiff_t srcstride,
+; int height, int mx, int my);
+;-------------------------------------------------------------------------------
+
+%macro FILTER_SSSE3 1
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+ mova m3, [filter_h6_shuf2]
+ mova m4, [filter_h6_shuf3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
+ mova m6, [sixtap_filter_hb+mxq*8-32]
+ mova m7, [sixtap_filter_hb+mxq*8-16]
+
+.nextrow:
+ movu m0, [srcq-2]
+ mova m1, m0
+ mova m2, m0
+%if mmsize == 8
+; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
+; shuffle with a memory operand
+ punpcklbw m0, [srcq+3]
+%else
+ pshufb m0, [filter_h6_shuf1]
+%endif
+ pshufb m1, m3
+ pshufb m2, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ paddsw m0, m1
+ paddsw m0, m2
+ pmulhrsw m0, [pw_256]
+ packuswb m0, m0
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+ mova m2, [pw_256]
+ mova m3, [filter_h2_shuf]
+ mova m4, [filter_h4_shuf]
+%ifdef PIC
+ lea picregq, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
+ mova m6, [fourtap_filter_hb+mxq]
+
+.nextrow:
+ movu m0, [srcq-1]
+ mova m1, m0
+ pshufb m0, m3
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ paddsw m0, m1
+ pmulhrsw m0, m2
+ packuswb m0, m0
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+myq-16]
+ mova m6, [fourtap_filter_hb+myq]
+ mova m7, [pw_256]
+
+ ; read 3 lines
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
+
+.nextrow:
+ movh m3, [srcq+2*srcstrideq] ; read new row
+ mova m4, m0
+ mova m0, m1
+ punpcklbw m4, m1
+ mova m1, m2
+ punpcklbw m2, m3
+ pmaddubsw m4, m5
+ pmaddubsw m2, m6
+ paddsw m4, m2
+ mova m2, m3
+ pmulhrsw m4, m7
+ packuswb m4, m4
+ movh [dstq], m4
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ lea myd, [myq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ lea myq, [sixtap_filter_hb+myq*8]
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+
+.nextrow:
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ mova m6, m0
+ punpcklbw m6, m5
+ mova m0, m1
+ punpcklbw m1, m2
+ mova m7, m3
+ punpcklbw m7, m4
+ pmaddubsw m6, [myq-48]
+ pmaddubsw m1, [myq-32]
+ pmaddubsw m7, [myq-16]
+ paddsw m6, m1
+ paddsw m6, m7
+ mova m1, m2
+ mova m2, m3
+ pmulhrsw m6, [pw_256]
+ mova m3, m4
+ packuswb m6, m6
+ mova m4, m5
+ movh [dstq], m6
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+FILTER_SSSE3 4
+INIT_XMM ssse3
+FILTER_SSSE3 8
+
+; 4x4 block, H-only 4-tap filter
+INIT_MMX mmxext
+cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [fourtap_filter_hw_m]
+%endif
+ movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
+ movq mm5, [fourtap_filter_hw+mxq]
+ movq mm7, [pw_64]
+ pxor mm6, mm6
+
+.nextrow:
+ movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
+
+ ; first set of 2 pixels
+ movq mm2, mm1 ; byte ABCD..
+ punpcklbw mm1, mm6 ; byte->word ABCD
+ pshufw mm0, mm2, 9 ; byte CDEF..
+ punpcklbw mm0, mm6 ; byte->word CDEF
+ pshufw mm3, mm1, 0x94 ; word ABBC
+ pshufw mm1, mm0, 0x94 ; word CDDE
+ pmaddwd mm3, mm4 ; multiply 2px with F0/F1
+ movq mm0, mm1 ; backup for second set of pixels
+ pmaddwd mm1, mm5 ; multiply 2px with F2/F3
+ paddd mm3, mm1 ; finish 1st 2px
+
+ ; second set of 2 pixels, use backup of above
+ punpckhbw mm2, mm6 ; byte->word EFGH
+ pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
+ pshufw mm1, mm2, 0x94 ; word EFFG
+ pmaddwd mm1, mm5 ; multiply 2px with F2/F3
+ paddd mm0, mm1 ; finish 2nd 2px
+
+ ; merge two sets of 2 pixels into one set of 4, round/clip/store
+ packssdw mm3, mm0 ; merge dword->word (4px)
+ paddsw mm3, mm7 ; rounding
+ psraw mm3, 7
+ packuswb mm3, mm6 ; clip and word->bytes
+ movd [dstq], mm3 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+; 4x4 block, H-only 6-tap filter
+INIT_MMX mmxext
+cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hw_m]
+%endif
+ movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
+ movq mm5, [sixtap_filter_hw+mxq*8-32]
+ movq mm6, [sixtap_filter_hw+mxq*8-16]
+ movq mm7, [pw_64]
+ pxor mm3, mm3
+
+.nextrow:
+ movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
+
+ ; first set of 2 pixels
+ movq mm2, mm1 ; byte ABCD..
+ punpcklbw mm1, mm3 ; byte->word ABCD
+ pshufw mm0, mm2, 0x9 ; byte CDEF..
+ punpckhbw mm2, mm3 ; byte->word EFGH
+ punpcklbw mm0, mm3 ; byte->word CDEF
+ pshufw mm1, mm1, 0x94 ; word ABBC
+ pshufw mm2, mm2, 0x94 ; word EFFG
+ pmaddwd mm1, mm4 ; multiply 2px with F0/F1
+ pshufw mm3, mm0, 0x94 ; word CDDE
+ movq mm0, mm3 ; backup for second set of pixels
+ pmaddwd mm3, mm5 ; multiply 2px with F2/F3
+ paddd mm1, mm3 ; add to 1st 2px cache
+ movq mm3, mm2 ; backup for second set of pixels
+ pmaddwd mm2, mm6 ; multiply 2px with F4/F5
+ paddd mm1, mm2 ; finish 1st 2px
+
+ ; second set of 2 pixels, use backup of above
+ movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
+ pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
+ pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
+ paddd mm0, mm3 ; add to 2nd 2px cache
+ pxor mm3, mm3
+ punpcklbw mm2, mm3 ; byte->word FGHI
+ pshufw mm2, mm2, 0xE9 ; word GHHI
+ pmaddwd mm2, mm6 ; multiply 2px with F4/F5
+ paddd mm0, mm2 ; finish 2nd 2px
+
+ ; merge two sets of 2 pixels into one set of 4, round/clip/store
+ packssdw mm1, mm0 ; merge dword->word (4px)
+ paddsw mm1, mm7 ; rounding
+ psraw mm1, 7
+ packuswb mm1, mm3 ; clip and word->bytes
+ movd [dstq], mm1 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+INIT_XMM sse2
+cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 5
+%ifdef PIC
+ lea picregq, [fourtap_filter_v_m]
+%endif
+ lea mxq, [fourtap_filter_v+mxq-32]
+ pxor m7, m7
+ mova m4, [pw_64]
+ mova m5, [mxq+ 0]
+ mova m6, [mxq+16]
+%ifdef m8
+ mova m8, [mxq+32]
+ mova m9, [mxq+48]
+%endif
+.nextrow:
+ movq m0, [srcq-1]
+ movq m1, [srcq-0]
+ movq m2, [srcq+1]
+ movq m3, [srcq+2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ pmullw m0, m5
+ pmullw m1, m6
+%ifdef m8
+ pmullw m2, m8
+ pmullw m3, m9
+%else
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
+%endif
+ paddsw m0, m1
+ paddsw m2, m3
+ paddsw m0, m2
+ paddsw m0, m4
+ psraw m0, 7
+ packuswb m0, m7
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+INIT_XMM sse2
+cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ lea mxq, [sixtap_filter_v+mxq-96]
+ pxor m7, m7
+ mova m6, [pw_64]
+%ifdef m8
+ mova m8, [mxq+ 0]
+ mova m9, [mxq+16]
+ mova m10, [mxq+32]
+ mova m11, [mxq+48]
+ mova m12, [mxq+64]
+ mova m13, [mxq+80]
+%endif
+.nextrow:
+ movq m0, [srcq-2]
+ movq m1, [srcq-1]
+ movq m2, [srcq-0]
+ movq m3, [srcq+1]
+ movq m4, [srcq+2]
+ movq m5, [srcq+3]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+%ifdef m8
+ pmullw m0, m8
+ pmullw m1, m9
+ pmullw m2, m10
+ pmullw m3, m11
+ pmullw m4, m12
+ pmullw m5, m13
+%else
+ pmullw m0, [mxq+ 0]
+ pmullw m1, [mxq+16]
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
+ pmullw m4, [mxq+64]
+ pmullw m5, [mxq+80]
+%endif
+ paddsw m1, m4
+ paddsw m0, m5
+ paddsw m1, m2
+ paddsw m0, m3
+ paddsw m0, m1
+ paddsw m0, m6
+ psraw m0, 7
+ packuswb m0, m7
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+%macro FILTER_V 1
+; 4x4 block, V-only 4-tap filter
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 5
+%ifdef PIC
+ lea picregq, [fourtap_filter_v_m]
+%endif
+ lea myq, [fourtap_filter_v+myq-32]
+ mova m6, [pw_64]
+ pxor m7, m7
+ mova m5, [myq+48]
+
+ ; read 3 lines
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+
+.nextrow:
+ ; first calculate negative taps (to prevent losing positive overflows)
+ movh m4, [srcq+2*srcstrideq] ; read new row
+ punpcklbw m4, m7
+ mova m3, m4
+ pmullw m0, [myq+0]
+ pmullw m4, m5
+ paddsw m4, m0
+
+ ; then calculate positive taps
+ mova m0, m1
+ pmullw m1, [myq+16]
+ paddsw m4, m1
+ mova m1, m2
+ pmullw m2, [myq+32]
+ paddsw m4, m2
+ mova m2, m3
+
+ ; round/clip/store
+ paddsw m4, m6
+ psraw m4, 7
+ packuswb m4, m7
+ movh [dstq], m4
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+
+; 4x4 block, V-only 6-tap filter
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+ lea myq, [myq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ lea myq, [sixtap_filter_v+myq-96]
+ pxor m7, m7
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+
+.nextrow:
+ ; first calculate negative taps (to prevent losing positive overflows)
+ mova m5, m1
+ pmullw m5, [myq+16]
+ mova m6, m4
+ pmullw m6, [myq+64]
+ paddsw m6, m5
+
+ ; then calculate positive taps
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ punpcklbw m5, m7
+ pmullw m0, [myq+0]
+ paddsw m6, m0
+ mova m0, m1
+ mova m1, m2
+ pmullw m2, [myq+32]
+ paddsw m6, m2
+ mova m2, m3
+ pmullw m3, [myq+48]
+ paddsw m6, m3
+ mova m3, m4
+ mova m4, m5
+ pmullw m5, [myq+80]
+ paddsw m6, m5
+
+ ; round/clip/store
+ paddsw m6, [pw_64]
+ psraw m6, 7
+ packuswb m6, m7
+ movh [dstq], m6
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+FILTER_V 4
+INIT_XMM sse2
+FILTER_V 8
+
+%macro FILTER_BILINEAR 1
+%if cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m3, [bilinear_filter_vb+myq-16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m2, [srcq+srcstrideq*2]
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%else ; cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vw_m]
+%endif
+ pxor m6, m6
+ mova m5, [bilinear_filter_vw+myq-1*16]
+ neg myq
+ mova m4, [bilinear_filter_vw+myq+7*16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m3, [srcq+srcstrideq*2]
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ punpcklbw m3, m6
+ mova m2, m1
+ pmullw m0, m4
+ pmullw m1, m5
+ pmullw m2, m4
+ pmullw m3, m5
+ paddsw m0, m1
+ paddsw m2, m3
+ psraw m0, 2
+ psraw m2, 2
+ pavgw m0, m6
+ pavgw m2, m6
+%if mmsize == 8
+ packuswb m0, m0
+ packuswb m2, m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
+%else
+ packuswb m0, m2
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%endif ; cpuflag(ssse3)
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+%if cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m2, [filter_h2_shuf]
+ mova m3, [bilinear_filter_vb+mxq-16]
+.nextrow:
+ movu m0, [srcq+srcstrideq*0]
+ movu m1, [srcq+srcstrideq*1]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%else ; cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vw_m]
+%endif
+ pxor m6, m6
+ mova m5, [bilinear_filter_vw+mxq-1*16]
+ neg mxq
+ mova m4, [bilinear_filter_vw+mxq+7*16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0+0]
+ movh m1, [srcq+srcstrideq*0+1]
+ movh m2, [srcq+srcstrideq*1+0]
+ movh m3, [srcq+srcstrideq*1+1]
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ punpcklbw m2, m6
+ punpcklbw m3, m6
+ pmullw m0, m4
+ pmullw m1, m5
+ pmullw m2, m4
+ pmullw m3, m5
+ paddsw m0, m1
+ paddsw m2, m3
+ psraw m0, 2
+ psraw m2, 2
+ pavgw m0, m6
+ pavgw m2, m6
+%if mmsize == 8
+ packuswb m0, m0
+ packuswb m2, m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
+%else
+ packuswb m0, m2
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%endif ; cpuflag(ssse3)
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+FILTER_BILINEAR 4
+INIT_XMM sse2
+FILTER_BILINEAR 8
+INIT_MMX ssse3
+FILTER_BILINEAR 4
+INIT_XMM ssse3
+FILTER_BILINEAR 8
+
+INIT_MMX mmx
+cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
+.nextrow:
+ movq mm0, [srcq+srcstrideq*0]
+ movq mm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0], mm0
+ movq [dstq+dststrideq*1], mm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
+.nextrow:
+ movq mm0, [srcq+srcstrideq*0+0]
+ movq mm1, [srcq+srcstrideq*0+8]
+ movq mm2, [srcq+srcstrideq*1+0]
+ movq mm3, [srcq+srcstrideq*1+8]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0+0], mm0
+ movq [dstq+dststrideq*0+8], mm1
+ movq [dstq+dststrideq*1+0], mm2
+ movq [dstq+dststrideq*1+8], mm3
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+%endif
+
+INIT_XMM sse
+cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
+.nextrow:
+ movups xmm0, [srcq+srcstrideq*0]
+ movups xmm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movaps [dstq+dststrideq*0], xmm0
+ movaps [dstq+dststrideq*1], xmm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+;-----------------------------------------------------------------------------
+
+%macro ADD_DC 4
+ %4 m2, [dst1q+%3]
+ %4 m3, [dst1q+strideq+%3]
+ %4 m4, [dst2q+%3]
+ %4 m5, [dst2q+strideq+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [dst1q+%3], m2
+ %4 [dst1q+strideq+%3], m3
+ %4 [dst2q+%3], m4
+ %4 [dst2q+strideq+%3], m5
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq]
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [blockq], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m1, 0, movh
+ RET
+%endif
+
+%macro VP8_IDCT_DC_ADD 0
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
+ ; load data
+ movd m0, [blockq]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq], m1
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ movd m2, [dst1q]
+ movd m3, [dst1q+strideq]
+ movd m4, [dst2q]
+ movd m5, [dst2q+strideq]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [dst1q], m2
+%if cpuflag(sse4)
+ pextrd [dst1q+strideq], m2, 1
+ pextrd [dst2q], m2, 2
+ pextrd [dst2q+strideq], m2, 3
+%else
+ psrldq m2, 4
+ movd [dst1q+strideq], m2
+ psrldq m2, 4
+ movd [dst2q], m2
+ psrldq m2, 4
+ movd [dst2q+strideq], m2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VP8_IDCT_DC_ADD
+INIT_XMM sse4
+VP8_IDCT_DC_ADD
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+;-----------------------------------------------------------------------------
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+%endif
+
+INIT_XMM sse2
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m1
+ movd [blockq+32*1], m1
+ movd [blockq+32*2], m1
+ movd [blockq+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m1, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m6, 0, mova
+ lea dst1q, [dst1q+strideq*4]
+ lea dst2q, [dst2q+strideq*4]
+ ADD_DC m1, m7, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+;-----------------------------------------------------------------------------
+
+; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
+; this macro assumes that m6/m7 have words for 20091/17734 loaded
+%macro VP8_MULTIPLY_SUMSUB 4
+ mova %3, %1
+ mova %4, %2
+ pmulhw %3, m6 ;20091(1)
+ pmulhw %4, m6 ;20091(2)
+ paddw %3, %1
+ paddw %4, %2
+ paddw %1, %1
+ paddw %2, %2
+ pmulhw %1, m7 ;35468(1)
+ pmulhw %2, m7 ;35468(2)
+ psubw %1, %4
+ paddw %2, %3
+%endmacro
+
+; calculate x0=%1+%3; x1=%1-%3
+; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
+; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
+; %5/%6 are temporary registers
+; we assume m6/m7 have constant words 20091/17734 loaded in them
+%macro VP8_IDCT_TRANSFORM4x4_1D 6
+ SUMSUB_BA w, %3, %1, %5 ;t0, t1
+ VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
+ SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
+ SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
+ SWAP %4, %1
+ SWAP %4, %3
+%endmacro
+
+%macro VP8_IDCT_ADD 0
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
+ ; load block data
+ movq m0, [blockq+ 0]
+ movq m1, [blockq+ 8]
+ movq m2, [blockq+16]
+ movq m3, [blockq+24]
+ movq m6, [pw_20091]
+ movq m7, [pw_17734]
+%if cpuflag(sse)
+ xorps xmm0, xmm0
+ movaps [blockq+ 0], xmm0
+ movaps [blockq+16], xmm0
+%else
+ pxor m4, m4
+ movq [blockq+ 0], m4
+ movq [blockq+ 8], m4
+ movq [blockq+16], m4
+ movq [blockq+24], m4
+%endif
+
+ ; actual IDCT
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ paddw m0, [pw_4]
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+
+ ; store
+ pxor m4, m4
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+2*strideq]
+ STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+ STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
+
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+VP8_IDCT_ADD
+%endif
+INIT_MMX sse
+VP8_IDCT_ADD
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
+;-----------------------------------------------------------------------------
+
+%macro SCATTER_WHT 3
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(0+%3)], dc1w
+ mov [blockq+2*16*(1+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ psrlq m%1, 32
+ psrlq m%2, 32
+ mov [blockq+2*16*(4+%3)], dc1w
+ mov [blockq+2*16*(5+%3)], dc2w
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(8+%3)], dc1w
+ mov [blockq+2*16*(9+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ mov [blockq+2*16*(12+%3)], dc1w
+ mov [blockq+2*16*(13+%3)], dc2w
+%endmacro
+
+%macro HADAMARD4_1D 4
+ SUMSUB_BADC w, %2, %1, %4, %3
+ SUMSUB_BADC w, %4, %2, %3, %1
+ SWAP %1, %4, %3
+%endmacro
+
+%macro VP8_DC_WHT 0
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+ movq m0, [dc1q]
+ movq m1, [dc1q+8]
+ movq m2, [dc1q+16]
+ movq m3, [dc1q+24]
+%if cpuflag(sse)
+ xorps xmm0, xmm0
+ movaps [dc1q+ 0], xmm0
+ movaps [dc1q+16], xmm0
+%else
+ pxor m4, m4
+ movq [dc1q+ 0], m4
+ movq [dc1q+ 8], m4
+ movq [dc1q+16], m4
+ movq [dc1q+24], m4
+%endif
+ HADAMARD4_1D 0, 1, 2, 3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ paddw m0, [pw_3]
+ HADAMARD4_1D 0, 1, 2, 3
+ psraw m0, 3
+ psraw m1, 3
+ psraw m2, 3
+ psraw m3, 3
+ SCATTER_WHT 0, 1, 0
+ SCATTER_WHT 2, 3, 2
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+VP8_DC_WHT
+%endif
+INIT_MMX sse
+VP8_DC_WHT
diff --git a/libs/ffvpx/libavcodec/x86/vp8dsp_init.c b/libs/ffvpx/libavcodec/x86/vp8dsp_init.c
new file mode 100644
index 000000000..397b2518c
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp8dsp_init.c
@@ -0,0 +1,467 @@
+/*
+ * VP8 DSP functions x86-optimized
+ * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp8dsp.h"
+
+#if HAVE_X86ASM
+
+/*
+ * MC functions
+ */
+void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+
+void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
+static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ dst, dststride, src, srcstride, height, mx, my); \
+ ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ dst + 8, dststride, src + 8, srcstride, height, mx, my); \
+}
+#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
+static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
+ dst, dststride, src, srcstride, height, mx, my); \
+ ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
+ dst + 4, dststride, src + 4, srcstride, height, mx, my); \
+}
+
+#if ARCH_X86_32
+TAP_W8 (mmxext, epel, h4)
+TAP_W8 (mmxext, epel, h6)
+TAP_W16(mmxext, epel, h6)
+TAP_W8 (mmxext, epel, v4)
+TAP_W8 (mmxext, epel, v6)
+TAP_W16(mmxext, epel, v6)
+TAP_W8 (mmxext, bilinear, h)
+TAP_W16(mmxext, bilinear, h)
+TAP_W8 (mmxext, bilinear, v)
+TAP_W16(mmxext, bilinear, v)
+#endif
+
+TAP_W16(sse2, epel, h6)
+TAP_W16(sse2, epel, v6)
+TAP_W16(sse2, bilinear, h)
+TAP_W16(sse2, bilinear, v)
+
+TAP_W16(ssse3, epel, h6)
+TAP_W16(ssse3, epel, v6)
+TAP_W16(ssse3, bilinear, h)
+TAP_W16(ssse3, bilinear, v)
+
+#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
+static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
+ uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
+ src -= srcstride * (TAPNUMY / 2 - 1); \
+ ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
+ tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
+ ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
+ dst, dststride, tmpptr, SIZE, height, mx, my); \
+}
+
+#if ARCH_X86_32
+#define HVTAPMMX(x, y) \
+HVTAP(mmxext, 8, x, y, 4, 8) \
+HVTAP(mmxext, 8, x, y, 8, 16)
+
+HVTAP(mmxext, 8, 6, 6, 16, 16)
+#else
+#define HVTAPMMX(x, y) \
+HVTAP(mmxext, 8, x, y, 4, 8)
+#endif
+
+HVTAPMMX(4, 4)
+HVTAPMMX(4, 6)
+HVTAPMMX(6, 4)
+HVTAPMMX(6, 6)
+
+#define HVTAPSSE2(x, y, w) \
+HVTAP(sse2, 16, x, y, w, 16) \
+HVTAP(ssse3, 16, x, y, w, 16)
+
+HVTAPSSE2(4, 4, 8)
+HVTAPSSE2(4, 6, 8)
+HVTAPSSE2(6, 4, 8)
+HVTAPSSE2(6, 6, 8)
+HVTAPSSE2(6, 6, 16)
+
+HVTAP(ssse3, 16, 4, 4, 4, 8)
+HVTAP(ssse3, 16, 4, 6, 4, 8)
+HVTAP(ssse3, 16, 6, 4, 4, 8)
+HVTAP(ssse3, 16, 6, 6, 4, 8)
+
+#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
+static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
+ ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
+ tmp, SIZE, src, srcstride, height + 1, mx, my); \
+ ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
+ dst, dststride, tmp, SIZE, height, mx, my); \
+}
+
+HVBILIN(mmxext, 8, 4, 8)
+#if ARCH_X86_32
+HVBILIN(mmxext, 8, 8, 16)
+HVBILIN(mmxext, 8, 16, 16)
+#endif
+HVBILIN(sse2, 8, 8, 16)
+HVBILIN(sse2, 8, 16, 16)
+HVBILIN(ssse3, 8, 4, 8)
+HVBILIN(ssse3, 8, 8, 16)
+HVBILIN(ssse3, 8, 16, 16)
+
+void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
+ ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
+ ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
+ ptrdiff_t stride);
+void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+
+#define DECLARE_LOOP_FILTER(NAME) \
+void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim); \
+void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim); \
+void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
+ ptrdiff_t stride, \
+ int e, int i, int hvt); \
+void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
+ ptrdiff_t stride, \
+ int e, int i, int hvt); \
+void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t s, \
+ int e, int i, int hvt); \
+void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t s, \
+ int e, int i, int hvt); \
+void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int e, int i, int hvt); \
+void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int e, int i, int hvt); \
+void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t s, \
+ int e, int i, int hvt); \
+void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t s, \
+ int e, int i, int hvt);
+
+DECLARE_LOOP_FILTER(mmx)
+DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(sse2)
+DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
+
+#endif /* HAVE_X86ASM */
+
+#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
+
+#define VP8_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
+ VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
+
+#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
+
+
+av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
+ c->put_vp8_epel_pixels_tab[0][0][0] =
+ c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
+#endif
+ c->put_vp8_epel_pixels_tab[1][0][0] =
+ c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
+ }
+
+ /* note that 4-tap width=16 functions are missing because w=16
+ * is only used for luma, and luma is always a copy or sixtap. */
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ VP8_MC_FUNC(2, 4, mmxext);
+ VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+#if ARCH_X86_32
+ VP8_LUMA_MC_FUNC(0, 16, mmxext);
+ VP8_MC_FUNC(1, 8, mmxext);
+ VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
+ VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
+#endif
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->put_vp8_epel_pixels_tab[0][0][0] =
+ c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ VP8_LUMA_MC_FUNC(0, 16, sse2);
+ VP8_MC_FUNC(1, 8, sse2);
+ VP8_BILINEAR_MC_FUNC(0, 16, sse2);
+ VP8_BILINEAR_MC_FUNC(1, 8, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ VP8_LUMA_MC_FUNC(0, 16, ssse3);
+ VP8_MC_FUNC(1, 8, ssse3);
+ VP8_MC_FUNC(2, 4, ssse3);
+ VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
+ VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
+ VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
+ }
+#endif /* HAVE_X86ASM */
+}
+
+av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
+#if ARCH_X86_32
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
+ c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
+#endif
+ }
+
+ /* note that 4-tap width=16 functions are missing because w=16
+ * is only used for luma, and luma is always a copy or sixtap. */
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+#if ARCH_X86_32
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+#endif
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ c->vp8_idct_add = ff_vp8_idct_add_sse;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
+
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
+
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
+ }
+#endif /* HAVE_X86ASM */
+}
diff --git a/libs/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm b/libs/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm
new file mode 100644
index 000000000..caeb40526
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -0,0 +1,1584 @@
+;******************************************************************************
+;* VP8 MMXEXT optimizations
+;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_27: times 8 dw 27
+pw_63: times 8 dw 63
+
+pb_4: times 16 db 4
+pb_F8: times 16 db 0xF8
+pb_FE: times 16 db 0xFE
+pb_27_63: times 8 db 27, 63
+pb_18_63: times 8 db 18, 63
+pb_9_63: times 8 db 9, 63
+
+cextern pb_1
+cextern pb_3
+cextern pw_9
+cextern pw_18
+cextern pb_80
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim);
+;-----------------------------------------------------------------------------
+
+; macro called with 7 mm register indexes as argument, and 4 regular registers
+;
+; first 4 mm registers will carry the transposed pixel data
+; the other three are scratchspace (one would be sufficient, but this allows
+; for more spreading/pipelining and thus faster execution on OOE CPUs)
+;
+; first two regular registers are buf+4*stride and buf+5*stride
+; third is -stride, fourth is +stride
+%macro READ_8x4_INTERLEAVED 11
+ ; interleave 8 (A-H) rows of 4 pixels each
+ movd m%1, [%8+%10*4] ; A0-3
+ movd m%5, [%9+%10*4] ; B0-3
+ movd m%2, [%8+%10*2] ; C0-3
+ movd m%6, [%8+%10] ; D0-3
+ movd m%3, [%8] ; E0-3
+ movd m%7, [%9] ; F0-3
+ movd m%4, [%9+%11] ; G0-3
+ punpcklbw m%1, m%5 ; A/B interleaved
+ movd m%5, [%9+%11*2] ; H0-3
+ punpcklbw m%2, m%6 ; C/D interleaved
+ punpcklbw m%3, m%7 ; E/F interleaved
+ punpcklbw m%4, m%5 ; G/H interleaved
+%endmacro
+
+; macro called with 7 mm register indexes as argument, and 5 regular registers
+; first 11 mean the same as READ_8x4_TRANSPOSED above
+; fifth regular register is scratchspace to reach the bottom 8 rows, it
+; will be set to second regular register + 8*stride at the end
+%macro READ_16x4_INTERLEAVED 12
+ ; transpose 16 (A-P) rows of 4 pixels each
+ lea %12, [r0+8*r2]
+
+ ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
+ movd m%1, [%8+%10*4] ; A0-3
+ movd m%3, [%12+%10*4] ; I0-3
+ movd m%2, [%8+%10*2] ; C0-3
+ movd m%4, [%12+%10*2] ; K0-3
+ movd m%6, [%8+%10] ; D0-3
+ movd m%5, [%12+%10] ; L0-3
+ movd m%7, [%12] ; M0-3
+ add %12, %11
+ punpcklbw m%1, m%3 ; A/I
+ movd m%3, [%8] ; E0-3
+ punpcklbw m%2, m%4 ; C/K
+ punpcklbw m%6, m%5 ; D/L
+ punpcklbw m%3, m%7 ; E/M
+ punpcklbw m%2, m%6 ; C/D/K/L interleaved
+
+ ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
+ movd m%5, [%9+%10*4] ; B0-3
+ movd m%4, [%12+%10*4] ; J0-3
+ movd m%7, [%9] ; F0-3
+ movd m%6, [%12] ; N0-3
+ punpcklbw m%5, m%4 ; B/J
+ punpcklbw m%7, m%6 ; F/N
+ punpcklbw m%1, m%5 ; A/B/I/J interleaved
+ punpcklbw m%3, m%7 ; E/F/M/N interleaved
+ movd m%4, [%9+%11] ; G0-3
+ movd m%6, [%12+%11] ; O0-3
+ movd m%5, [%9+%11*2] ; H0-3
+ movd m%7, [%12+%11*2] ; P0-3
+ punpcklbw m%4, m%6 ; G/O
+ punpcklbw m%5, m%7 ; H/P
+ punpcklbw m%4, m%5 ; G/H/O/P interleaved
+%endmacro
+
+; write 4 mm registers of 2 dwords each
+; first four arguments are mm register indexes containing source data
+; last four are registers containing buf+4*stride, buf+5*stride,
+; -stride and +stride
+%macro WRITE_4x2D 8
+ ; write out (2 dwords per register)
+ movd [%5+%7*4], m%1
+ movd [%5+%7*2], m%2
+ movd [%5], m%3
+ movd [%6+%8], m%4
+ punpckhdq m%1, m%1
+ punpckhdq m%2, m%2
+ punpckhdq m%3, m%3
+ punpckhdq m%4, m%4
+ movd [%6+%7*4], m%1
+ movd [%5+%7], m%2
+ movd [%6], m%3
+ movd [%6+%8*2], m%4
+%endmacro
+
+; write 4 xmm registers of 4 dwords each
+; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
+; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
+; we add 1*stride to the third regular registry in the process
+; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
+; same memory region), or 8 if they cover two separate buffers (third one points to
+; a different memory region than the first two), allowing for more optimal code for
+; the 16-width case
+%macro WRITE_4x4D 10
+ ; write out (4 dwords per register), start with dwords zero
+ movd [%5+%8*4], m%1
+ movd [%5], m%2
+ movd [%7+%8*4], m%3
+ movd [%7], m%4
+
+ ; store dwords 1
+ psrldq m%1, 4
+ psrldq m%2, 4
+ psrldq m%3, 4
+ psrldq m%4, 4
+ movd [%6+%8*4], m%1
+ movd [%6], m%2
+%if %10 == 16
+ movd [%6+%9*4], m%3
+%endif
+ movd [%7+%9], m%4
+
+ ; write dwords 2
+ psrldq m%1, 4
+ psrldq m%2, 4
+%if %10 == 8
+ movd [%5+%8*2], m%1
+ movd %5d, m%3
+%endif
+ psrldq m%3, 4
+ psrldq m%4, 4
+%if %10 == 16
+ movd [%5+%8*2], m%1
+%endif
+ movd [%6+%9], m%2
+ movd [%7+%8*2], m%3
+ movd [%7+%9*2], m%4
+ add %7, %9
+
+ ; store dwords 3
+ psrldq m%1, 4
+ psrldq m%2, 4
+ psrldq m%3, 4
+ psrldq m%4, 4
+%if %10 == 8
+ mov [%7+%8*4], %5d
+ movd [%6+%8*2], m%1
+%else
+ movd [%5+%8], m%1
+%endif
+ movd [%6+%9*2], m%2
+ movd [%7+%8*2], m%3
+ movd [%7+%9*2], m%4
+%endmacro
+
+; write 4 or 8 words in the mmx/xmm registers as 8 lines
+; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
+; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
+; 4 is a pointer to the destination's 4th line
+; 5/6 is -stride and +stride
+%macro WRITE_2x4W 6
+ movd %3d, %1
+ punpckhdq %1, %1
+ mov [%4+%5*4], %3w
+ shr %3, 16
+ add %4, %6
+ mov [%4+%5*4], %3w
+
+ movd %3d, %1
+ add %4, %5
+ mov [%4+%5*2], %3w
+ shr %3, 16
+ mov [%4+%5 ], %3w
+
+ movd %3d, %2
+ punpckhdq %2, %2
+ mov [%4 ], %3w
+ shr %3, 16
+ mov [%4+%6 ], %3w
+
+ movd %3d, %2
+ add %4, %6
+ mov [%4+%6 ], %3w
+ shr %3, 16
+ mov [%4+%6*2], %3w
+ add %4, %5
+%endmacro
+
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+ pextrw [%3+%4*4], %1, 0
+ pextrw [%2+%4*4], %1, 1
+ pextrw [%3+%4*2], %1, 2
+ pextrw [%3+%4 ], %1, 3
+ pextrw [%3 ], %1, 4
+ pextrw [%2 ], %1, 5
+ pextrw [%2+%5 ], %1, 6
+ pextrw [%2+%5*2], %1, 7
+%else
+ movd %2d, %1
+ psrldq %1, 4
+ mov [%3+%4*4], %2w
+ shr %2, 16
+ add %3, %5
+ mov [%3+%4*4], %2w
+
+ movd %2d, %1
+ psrldq %1, 4
+ add %3, %4
+ mov [%3+%4*2], %2w
+ shr %2, 16
+ mov [%3+%4 ], %2w
+
+ movd %2d, %1
+ psrldq %1, 4
+ mov [%3 ], %2w
+ shr %2, 16
+ mov [%3+%5 ], %2w
+
+ movd %2d, %1
+ add %3, %5
+ mov [%3+%5 ], %2w
+ shr %2, 16
+ mov [%3+%5*2], %2w
+%endif
+%endmacro
+
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
+%if mmsize == 8 ; mmx/mmxext
+ mov cntrq, 2
+%endif
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, flim, m0 ; splat "flim" into register
+
+ ; set up indexes to address 4 rows
+%if mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, cntr, dst2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst3, dst2
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+4*strideq-2]
+%endif
+
+%if mmsize == 8 ; mmx / mmxext
+.next8px:
+%endif
+%ifidn %1, v
+ ; read 4 half/full rows of pixels
+ mova m0, [dst1q+mstrideq*2] ; p1
+ mova m1, [dst1q+mstrideq] ; p0
+ mova m2, [dst1q] ; q0
+ mova m3, [dst1q+ strideq] ; q1
+%else ; h
+ lea dst2q, [dst1q+ strideq]
+
+%if mmsize == 8 ; mmx/mmxext
+ READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
+%else ; sse2
+ READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
+%endif
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+
+ ; simple_limit
+ mova m5, m2 ; m5=backup of q0
+ mova m6, m1 ; m6=backup of p0
+ psubusb m1, m2 ; p0-q0
+ psubusb m2, m6 ; q0-p0
+ por m1, m2 ; FFABS(p0-q0)
+ paddusb m1, m1 ; m1=FFABS(p0-q0)*2
+
+ mova m4, m3
+ mova m2, m0
+ psubusb m3, m0 ; q1-p1
+ psubusb m0, m4 ; p1-q1
+ por m3, m0 ; FFABS(p1-q1)
+ mova m0, [pb_80]
+ pxor m2, m0
+ pxor m4, m0
+ psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
+ pand m3, [pb_FE]
+ psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
+ paddusb m3, m1
+ psubusb m3, m7
+ pxor m1, m1
+ pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
+
+ ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
+ mova m4, m5
+ pxor m5, m0
+ pxor m0, m6
+ psubsb m5, m0 ; q0-p0 (signed)
+ paddsb m2, m5
+ paddsb m2, m5
+ paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
+ pand m2, m3 ; apply filter mask (m3)
+
+ mova m3, [pb_F8]
+ mova m1, m2
+ paddsb m2, [pb_4] ; f1<<3=a+4
+ paddsb m1, [pb_3] ; f2<<3=a+3
+ pand m2, m3
+ pand m1, m3 ; cache f2<<3
+
+ pxor m0, m0
+ pxor m3, m3
+ pcmpgtb m0, m2 ; which values are <0?
+ psubb m3, m2 ; -f1<<3
+ psrlq m2, 3 ; +f1
+ psrlq m3, 3 ; -f1
+ pand m3, m0
+ pandn m0, m2
+ psubusb m4, m0
+ paddusb m4, m3 ; q0-f1
+
+ pxor m0, m0
+ pxor m3, m3
+ pcmpgtb m0, m1 ; which values are <0?
+ psubb m3, m1 ; -f2<<3
+ psrlq m1, 3 ; +f2
+ psrlq m3, 3 ; -f2
+ pand m3, m0
+ pandn m0, m1
+ paddusb m6, m0
+ psubusb m6, m3 ; p0+f2
+
+ ; store
+%ifidn %1, v
+ mova [dst1q], m4
+ mova [dst1q+mstrideq], m6
+%else ; h
+ inc dst1q
+ SBUTTERFLY bw, 6, 4, 0
+
+%if mmsize == 16 ; sse2
+%if cpuflag(sse4)
+ inc dst2q
+%endif
+ WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
+ lea dst2q, [dst3q+mstrideq+1]
+%if cpuflag(sse4)
+ inc dst3q
+%endif
+ WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
+%else ; mmx/mmxext
+ WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
+%endif
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+ ; next 8 pixels
+%ifidn %1, v
+ add dst1q, 8 ; advance 8 cols = pixels
+%else ; h
+ lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
+%endif
+ dec cntrq
+ jg .next8px
+ REP_RET
+%else ; sse2
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmxext
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+%endif
+
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride,
+; int flimE, int flimI, int hev_thr);
+;-----------------------------------------------------------------------------
+
+%macro INNER_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ; [3]=hev() result
+%define stack_size mmsize * -4
+%else ; h ; extra storage space for transposes
+%define stack_size mmsize * -5
+%endif
+%endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+ pxor m7, m7
+%endif
+
+%ifndef m8
+ ; splat function arguments
+ SPLATB_REG m0, flimEq, m7 ; E
+ SPLATB_REG m1, flimIq, m7 ; I
+ SPLATB_REG m2, hevthrq, m7 ; hev_thresh
+
+%define m_flimE [rsp]
+%define m_flimI [rsp+mmsize]
+%define m_hevthr [rsp+mmsize*2]
+%define m_maskres [rsp+mmsize*3]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]
+
+ mova m_flimE, m0
+ mova m_flimI, m1
+ mova m_hevthr, m2
+%else
+%define m_flimE m9
+%define m_flimI m10
+%define m_hevthr m11
+%define m_maskres m12
+%define m_p0backup m12
+%define m_q0backup m8
+
+ ; splat function arguments
+ SPLATB_REG m_flimE, flimEq, m7 ; E
+ SPLATB_REG m_flimI, flimIq, m7 ; I
+ SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
+%endif
+
+%if %2 == 8 ; chroma
+ DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+ mov cntrq, 2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst2, dst8
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+strideq*4-4]
+%if %2 == 8 ; chroma
+ lea dst8q, [dst8q+strideq*4-4]
+%endif
+%endif
+
+%if mmsize == 8
+.next8px:
+%endif
+ ; read
+ lea dst2q, [dst1q+strideq]
+%ifidn %1, v
+%if %2 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+ movrow m0, [dst1q+mstrideq*4] ; p3
+ movrow m1, [dst2q+mstrideq*4] ; p2
+ movrow m2, [dst1q+mstrideq*2] ; p1
+ movrow m5, [dst2q] ; q1
+ movrow m6, [dst2q+ strideq*1] ; q2
+ movrow m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+ movhps m0, [dst8q+mstrideq*4]
+ movhps m2, [dst8q+mstrideq*2]
+ add dst8q, strideq
+ movhps m1, [dst8q+mstrideq*4]
+ movhps m5, [dst8q]
+ movhps m6, [dst8q+ strideq ]
+ movhps m7, [dst8q+ strideq*2]
+ add dst8q, mstrideq
+%endif
+%elif mmsize == 8 ; mmx/mmxext (h)
+ ; read 8 rows of 8px each
+ movu m0, [dst1q+mstrideq*4]
+ movu m1, [dst2q+mstrideq*4]
+ movu m2, [dst1q+mstrideq*2]
+ movu m3, [dst1q+mstrideq ]
+ movu m4, [dst1q]
+ movu m5, [dst2q]
+ movu m6, [dst2q+ strideq ]
+
+ ; 8x8 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+ mova m_q0backup, m1
+ movu m7, [dst2q+ strideq*2]
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+ mova m_p0backup, m5 ; store p0
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%else ; sse2 (h)
+%if %2 == 16
+ lea dst8q, [dst1q+ strideq*8]
+%endif
+
+ ; read 16 rows of 8px each, interleave
+ movh m0, [dst1q+mstrideq*4]
+ movh m1, [dst8q+mstrideq*4]
+ movh m2, [dst1q+mstrideq*2]
+ movh m5, [dst8q+mstrideq*2]
+ movh m3, [dst1q+mstrideq ]
+ movh m6, [dst8q+mstrideq ]
+ movh m4, [dst1q]
+ movh m7, [dst8q]
+ punpcklbw m0, m1 ; A/I
+ punpcklbw m2, m5 ; C/K
+ punpcklbw m3, m6 ; D/L
+ punpcklbw m4, m7 ; E/M
+
+ add dst8q, strideq
+ movh m1, [dst2q+mstrideq*4]
+ movh m6, [dst8q+mstrideq*4]
+ movh m5, [dst2q]
+ movh m7, [dst8q]
+ punpcklbw m1, m6 ; B/J
+ punpcklbw m5, m7 ; F/N
+ movh m6, [dst2q+ strideq ]
+ movh m7, [dst8q+ strideq ]
+ punpcklbw m6, m7 ; G/O
+
+ ; 8x16 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m_q0backup, m1
+%endif
+ movh m7, [dst2q+ strideq*2]
+ movh m1, [dst8q+ strideq*2]
+ punpcklbw m7, m1 ; H/P
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m8
+ SWAP 1, 8
+ SWAP 2, 8
+%else
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m12
+ SWAP 5, 12
+%else
+ mova m_p0backup, m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%endif
+
+ ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
+ mova m4, m1
+ SWAP 4, 1
+ psubusb m4, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m4 ; abs(p3-p2)
+
+ mova m4, m2
+ SWAP 4, 2
+ psubusb m4, m1 ; p1-p2
+ psubusb m1, m2 ; p2-p1
+ por m1, m4 ; abs(p2-p1)
+
+ mova m4, m6
+ SWAP 4, 6
+ psubusb m4, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m4 ; abs(q3-q2)
+
+ mova m4, m5
+ SWAP 4, 5
+ psubusb m4, m6 ; q1-q2
+ psubusb m6, m5 ; q2-q1
+ por m6, m4 ; abs(q2-q1)
+
+%if notcpuflag(mmxext)
+ mova m4, m_flimI
+ pxor m3, m3
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m7, m4
+ psubusb m6, m4
+ pcmpeqb m0, m3 ; abs(p3-p2) <= I
+ pcmpeqb m1, m3 ; abs(p2-p1) <= I
+ pcmpeqb m7, m3 ; abs(q3-q2) <= I
+ pcmpeqb m6, m3 ; abs(q2-q1) <= I
+ pand m0, m1
+ pand m7, m6
+ pand m0, m7
+%else ; mmxext/sse2
+ pmaxub m0, m1
+ pmaxub m6, m7
+ pmaxub m0, m6
+%endif
+
+ ; normal_limit and high_edge_variance for p1-p0, q1-q0
+ SWAP 7, 3 ; now m7 is zero
+%ifidn %1, v
+ movrow m3, [dst1q+mstrideq ] ; p0
+%if mmsize == 16 && %2 == 8
+ movhps m3, [dst8q+mstrideq ]
+%endif
+%elifdef m12
+ SWAP 3, 12
+%else
+ mova m3, m_p0backup
+%endif
+
+ mova m1, m2
+ SWAP 1, 2
+ mova m6, m3
+ SWAP 3, 6
+ psubusb m1, m3 ; p1-p0
+ psubusb m6, m2 ; p0-p1
+ por m1, m6 ; abs(p1-p0)
+%if notcpuflag(mmxext)
+ mova m6, m1
+ psubusb m1, m4
+ psubusb m6, m_hevthr
+ pcmpeqb m1, m7 ; abs(p1-p0) <= I
+ pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
+ pand m0, m1
+ mova m_maskres, m6
+%else ; mmxext/sse2
+ pmaxub m0, m1 ; max_I
+ SWAP 1, 4 ; max_hev_thresh
+%endif
+
+ SWAP 6, 4 ; now m6 is I
+%ifidn %1, v
+ movrow m4, [dst1q] ; q0
+%if mmsize == 16 && %2 == 8
+ movhps m4, [dst8q]
+%endif
+%elifdef m8
+ SWAP 4, 8
+%else
+ mova m4, m_q0backup
+%endif
+ mova m1, m4
+ SWAP 1, 4
+ mova m7, m5
+ SWAP 7, 5
+ psubusb m1, m5 ; q0-q1
+ psubusb m7, m4 ; q1-q0
+ por m1, m7 ; abs(q1-q0)
+%if notcpuflag(mmxext)
+ mova m7, m1
+ psubusb m1, m6
+ psubusb m7, m_hevthr
+ pxor m6, m6
+ pcmpeqb m1, m6 ; abs(q1-q0) <= I
+ pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
+ mova m6, m_maskres
+ pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
+ pand m6, m7
+%else ; mmxext/sse2
+ pxor m7, m7
+ pmaxub m0, m1
+ pmaxub m6, m1
+ psubusb m0, m_flimI
+ psubusb m6, m_hevthr
+ pcmpeqb m0, m7 ; max(abs(..)) <= I
+ pcmpeqb m6, m7 ; !(max(abs..) > thresh)
+%endif
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+%endif
+
+ ; simple_limit
+ mova m1, m3
+ SWAP 1, 3
+ mova m6, m4 ; keep copies of p0/q0 around for later use
+ SWAP 6, 4
+ psubusb m1, m4 ; p0-q0
+ psubusb m6, m3 ; q0-p0
+ por m1, m6 ; abs(q0-p0)
+ paddusb m1, m1 ; m1=2*abs(q0-p0)
+
+ mova m7, m2
+ SWAP 7, 2
+ mova m6, m5
+ SWAP 6, 5
+ psubusb m7, m5 ; p1-q1
+ psubusb m6, m2 ; q1-p1
+ por m7, m6 ; abs(q1-p1)
+ pxor m6, m6
+ pand m7, [pb_FE]
+ psrlq m7, 1 ; abs(q1-p1)/2
+ paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
+ psubusb m7, m_flimE
+ pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
+ pand m0, m7 ; normal_limit result
+
+ ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
+%ifdef m8 ; x86-64 && sse2
+ mova m8, [pb_80]
+%define m_pb_80 m8
+%else ; x86-32 or mmx/mmxext
+%define m_pb_80 [pb_80]
+%endif
+ mova m1, m4
+ mova m7, m3
+ pxor m1, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m1, m7 ; (signed) q0-p0
+ mova m6, m2
+ mova m7, m5
+ pxor m6, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m6, m7 ; (signed) p1-q1
+ mova m7, m_maskres
+ pandn m7, m6
+ paddsb m7, m1
+ paddsb m7, m1
+ paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
+
+ pand m7, m0
+ mova m1, [pb_F8]
+ mova m6, m7
+ paddsb m7, [pb_3]
+ paddsb m6, [pb_4]
+ pand m7, m1
+ pand m6, m1
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m1, m7
+ psubb m0, m7
+ psrlq m7, 3 ; +f2
+ psrlq m0, 3 ; -f2
+ pand m0, m1
+ pandn m1, m7
+ psubusb m3, m0
+ paddusb m3, m1 ; p0+f2
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m0, m6
+ psubb m1, m6
+ psrlq m6, 3 ; +f1
+ psrlq m1, 3 ; -f1
+ pand m1, m0
+ pandn m0, m6
+ psubusb m4, m0
+ paddusb m4, m1 ; q0-f1
+
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m6, m_maskres
+%endif
+%if notcpuflag(mmxext)
+ mova m7, [pb_1]
+%else ; mmxext/sse2
+ pxor m7, m7
+%endif
+ pand m0, m6
+ pand m1, m6
+%if notcpuflag(mmxext)
+ paddusb m0, m7
+ pand m1, [pb_FE]
+ pandn m7, m0
+ psrlq m1, 1
+ psrlq m7, 1
+ SWAP 0, 7
+%else ; mmxext/sse2
+ psubusb m1, [pb_1]
+ pavgb m0, m7 ; a
+ pavgb m1, m7 ; -a
+%endif
+ psubusb m5, m0
+ psubusb m2, m1
+ paddusb m5, m1 ; q1-a
+ paddusb m2, m0 ; p1+a
+
+ ; store
+%ifidn %1, v
+ movrow [dst1q+mstrideq*2], m2
+ movrow [dst1q+mstrideq ], m3
+ movrow [dst1q], m4
+ movrow [dst1q+ strideq ], m5
+%if mmsize == 16 && %2 == 8
+ movhps [dst8q+mstrideq*2], m2
+ movhps [dst8q+mstrideq ], m3
+ movhps [dst8q], m4
+ movhps [dst8q+ strideq ], m5
+%endif
+%else ; h
+ add dst1q, 2
+ add dst2q, 2
+
+ ; 4x8/16 transpose
+ TRANSPOSE4x4B 2, 3, 4, 5, 6
+
+%if mmsize == 8 ; mmx/mmxext (h)
+ WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
+%else ; sse2 (h)
+ lea dst8q, [dst8q+mstrideq +2]
+ WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
+%endif
+%endif
+
+%if mmsize == 8
+%if %2 == 8 ; chroma
+%ifidn %1, h
+ sub dst1q, 2
+%endif
+ cmp dst1q, dst8q
+ mov dst1q, dst8q
+ jnz .next8px
+%else
+%ifidn %1, h
+ lea dst1q, [dst1q+ strideq*8-2]
+%else ; v
+ add dst1q, 8
+%endif
+ dec cntrq
+ jg .next8px
+%endif
+ REP_RET
+%else ; mmsize == 16
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+INIT_MMX mmxext
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+%endif
+
+INIT_XMM sse2
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+;-----------------------------------------------------------------------------
+; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride,
+; int flimE, int flimI, int hev_thr);
+;-----------------------------------------------------------------------------
+
+%macro MBEDGE_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%if mmsize == 16 ; [3]=hev() result
+ ; [4]=filter tmp result
+ ; [5]/[6] = p2/q2 backup
+ ; [7]=lim_res sign result
+%define stack_size mmsize * -7
+%else ; 8 ; extra storage space for transposes
+%define stack_size mmsize * -8
+%endif
+%endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+ pxor m7, m7
+%endif
+
+%ifndef m8
+ ; splat function arguments
+ SPLATB_REG m0, flimEq, m7 ; E
+ SPLATB_REG m1, flimIq, m7 ; I
+ SPLATB_REG m2, hevthrq, m7 ; hev_thresh
+
+%define m_flimE [rsp]
+%define m_flimI [rsp+mmsize]
+%define m_hevthr [rsp+mmsize*2]
+%define m_maskres [rsp+mmsize*3]
+%define m_limres [rsp+mmsize*4]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]
+%define m_p2backup [rsp+mmsize*5]
+%define m_q2backup [rsp+mmsize*6]
+%if mmsize == 16
+%define m_limsign [rsp]
+%else
+%define m_limsign [rsp+mmsize*7]
+%endif
+
+ mova m_flimE, m0
+ mova m_flimI, m1
+ mova m_hevthr, m2
+%else ; sse2 on x86-64
+%define m_flimE m9
+%define m_flimI m10
+%define m_hevthr m11
+%define m_maskres m12
+%define m_limres m8
+%define m_p0backup m12
+%define m_q0backup m8
+%define m_p2backup m13
+%define m_q2backup m14
+%define m_limsign m9
+
+ ; splat function arguments
+ SPLATB_REG m_flimE, flimEq, m7 ; E
+ SPLATB_REG m_flimI, flimIq, m7 ; I
+ SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
+%endif
+
+%if %2 == 8 ; chroma
+ DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+ mov cntrq, 2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst2, dst8
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+strideq*4-4]
+%if %2 == 8 ; chroma
+ lea dst8q, [dst8q+strideq*4-4]
+%endif
+%endif
+
+%if mmsize == 8
+.next8px:
+%endif
+ ; read
+ lea dst2q, [dst1q+ strideq ]
+%ifidn %1, v
+%if %2 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+ movrow m0, [dst1q+mstrideq*4] ; p3
+ movrow m1, [dst2q+mstrideq*4] ; p2
+ movrow m2, [dst1q+mstrideq*2] ; p1
+ movrow m5, [dst2q] ; q1
+ movrow m6, [dst2q+ strideq ] ; q2
+ movrow m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+ movhps m0, [dst8q+mstrideq*4]
+ movhps m2, [dst8q+mstrideq*2]
+ add dst8q, strideq
+ movhps m1, [dst8q+mstrideq*4]
+ movhps m5, [dst8q]
+ movhps m6, [dst8q+ strideq ]
+ movhps m7, [dst8q+ strideq*2]
+ add dst8q, mstrideq
+%endif
+%elif mmsize == 8 ; mmx/mmxext (h)
+ ; read 8 rows of 8px each
+ movu m0, [dst1q+mstrideq*4]
+ movu m1, [dst2q+mstrideq*4]
+ movu m2, [dst1q+mstrideq*2]
+ movu m3, [dst1q+mstrideq ]
+ movu m4, [dst1q]
+ movu m5, [dst2q]
+ movu m6, [dst2q+ strideq ]
+
+ ; 8x8 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+ mova m_q0backup, m1
+ movu m7, [dst2q+ strideq*2]
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+ mova m_p0backup, m5 ; store p0
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%else ; sse2 (h)
+%if %2 == 16
+ lea dst8q, [dst1q+ strideq*8 ]
+%endif
+
+ ; read 16 rows of 8px each, interleave
+ movh m0, [dst1q+mstrideq*4]
+ movh m1, [dst8q+mstrideq*4]
+ movh m2, [dst1q+mstrideq*2]
+ movh m5, [dst8q+mstrideq*2]
+ movh m3, [dst1q+mstrideq ]
+ movh m6, [dst8q+mstrideq ]
+ movh m4, [dst1q]
+ movh m7, [dst8q]
+ punpcklbw m0, m1 ; A/I
+ punpcklbw m2, m5 ; C/K
+ punpcklbw m3, m6 ; D/L
+ punpcklbw m4, m7 ; E/M
+
+ add dst8q, strideq
+ movh m1, [dst2q+mstrideq*4]
+ movh m6, [dst8q+mstrideq*4]
+ movh m5, [dst2q]
+ movh m7, [dst8q]
+ punpcklbw m1, m6 ; B/J
+ punpcklbw m5, m7 ; F/N
+ movh m6, [dst2q+ strideq ]
+ movh m7, [dst8q+ strideq ]
+ punpcklbw m6, m7 ; G/O
+
+ ; 8x16 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m_q0backup, m1
+%endif
+ movh m7, [dst2q+ strideq*2]
+ movh m1, [dst8q+ strideq*2]
+ punpcklbw m7, m1 ; H/P
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m8
+ SWAP 1, 8
+ SWAP 2, 8
+%else
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m12
+ SWAP 5, 12
+%else
+ mova m_p0backup, m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%endif
+
+ ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
+ mova m4, m1
+ SWAP 4, 1
+ psubusb m4, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m4 ; abs(p3-p2)
+
+ mova m4, m2
+ SWAP 4, 2
+ psubusb m4, m1 ; p1-p2
+ mova m_p2backup, m1
+ psubusb m1, m2 ; p2-p1
+ por m1, m4 ; abs(p2-p1)
+
+ mova m4, m6
+ SWAP 4, 6
+ psubusb m4, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m4 ; abs(q3-q2)
+
+ mova m4, m5
+ SWAP 4, 5
+ psubusb m4, m6 ; q1-q2
+ mova m_q2backup, m6
+ psubusb m6, m5 ; q2-q1
+ por m6, m4 ; abs(q2-q1)
+
+%if notcpuflag(mmxext)
+ mova m4, m_flimI
+ pxor m3, m3
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m7, m4
+ psubusb m6, m4
+ pcmpeqb m0, m3 ; abs(p3-p2) <= I
+ pcmpeqb m1, m3 ; abs(p2-p1) <= I
+ pcmpeqb m7, m3 ; abs(q3-q2) <= I
+ pcmpeqb m6, m3 ; abs(q2-q1) <= I
+ pand m0, m1
+ pand m7, m6
+ pand m0, m7
+%else ; mmxext/sse2
+ pmaxub m0, m1
+ pmaxub m6, m7
+ pmaxub m0, m6
+%endif
+
+ ; normal_limit and high_edge_variance for p1-p0, q1-q0
+ SWAP 7, 3 ; now m7 is zero
+%ifidn %1, v
+ movrow m3, [dst1q+mstrideq ] ; p0
+%if mmsize == 16 && %2 == 8
+ movhps m3, [dst8q+mstrideq ]
+%endif
+%elifdef m12
+ SWAP 3, 12
+%else
+ mova m3, m_p0backup
+%endif
+
+ mova m1, m2
+ SWAP 1, 2
+ mova m6, m3
+ SWAP 3, 6
+ psubusb m1, m3 ; p1-p0
+ psubusb m6, m2 ; p0-p1
+ por m1, m6 ; abs(p1-p0)
+%if notcpuflag(mmxext)
+ mova m6, m1
+ psubusb m1, m4
+ psubusb m6, m_hevthr
+ pcmpeqb m1, m7 ; abs(p1-p0) <= I
+ pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
+ pand m0, m1
+ mova m_maskres, m6
+%else ; mmxext/sse2
+ pmaxub m0, m1 ; max_I
+ SWAP 1, 4 ; max_hev_thresh
+%endif
+
+ SWAP 6, 4 ; now m6 is I
+%ifidn %1, v
+ movrow m4, [dst1q] ; q0
+%if mmsize == 16 && %2 == 8
+ movhps m4, [dst8q]
+%endif
+%elifdef m8
+ SWAP 4, 8
+%else
+ mova m4, m_q0backup
+%endif
+ mova m1, m4
+ SWAP 1, 4
+ mova m7, m5
+ SWAP 7, 5
+ psubusb m1, m5 ; q0-q1
+ psubusb m7, m4 ; q1-q0
+ por m1, m7 ; abs(q1-q0)
+%if notcpuflag(mmxext)
+ mova m7, m1
+ psubusb m1, m6
+ psubusb m7, m_hevthr
+ pxor m6, m6
+ pcmpeqb m1, m6 ; abs(q1-q0) <= I
+ pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
+ mova m6, m_maskres
+ pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
+ pand m6, m7
+%else ; mmxext/sse2
+ pxor m7, m7
+ pmaxub m0, m1
+ pmaxub m6, m1
+ psubusb m0, m_flimI
+ psubusb m6, m_hevthr
+ pcmpeqb m0, m7 ; max(abs(..)) <= I
+ pcmpeqb m6, m7 ; !(max(abs..) > thresh)
+%endif
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+%endif
+
+ ; simple_limit
+ mova m1, m3
+ SWAP 1, 3
+ mova m6, m4 ; keep copies of p0/q0 around for later use
+ SWAP 6, 4
+ psubusb m1, m4 ; p0-q0
+ psubusb m6, m3 ; q0-p0
+ por m1, m6 ; abs(q0-p0)
+ paddusb m1, m1 ; m1=2*abs(q0-p0)
+
+ mova m7, m2
+ SWAP 7, 2
+ mova m6, m5
+ SWAP 6, 5
+ psubusb m7, m5 ; p1-q1
+ psubusb m6, m2 ; q1-p1
+ por m7, m6 ; abs(q1-p1)
+ pxor m6, m6
+ pand m7, [pb_FE]
+ psrlq m7, 1 ; abs(q1-p1)/2
+ paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
+ psubusb m7, m_flimE
+ pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
+ pand m0, m7 ; normal_limit result
+
+ ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
+%ifdef m8 ; x86-64 && sse2
+ mova m8, [pb_80]
+%define m_pb_80 m8
+%else ; x86-32 or mmx/mmxext
+%define m_pb_80 [pb_80]
+%endif
+ mova m1, m4
+ mova m7, m3
+ pxor m1, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m1, m7 ; (signed) q0-p0
+ mova m6, m2
+ mova m7, m5
+ pxor m6, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m6, m7 ; (signed) p1-q1
+ mova m7, m_maskres
+ paddsb m6, m1
+ paddsb m6, m1
+ paddsb m6, m1
+ pand m6, m0
+%ifdef m8
+ mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
+ pand m_limres, m7
+%else
+ mova m0, m6
+ pand m0, m7
+ mova m_limres, m0
+%endif
+ pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
+
+ mova m1, [pb_F8]
+ mova m6, m7
+ paddsb m7, [pb_3]
+ paddsb m6, [pb_4]
+ pand m7, m1
+ pand m6, m1
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m1, m7
+ psubb m0, m7
+ psrlq m7, 3 ; +f2
+ psrlq m0, 3 ; -f2
+ pand m0, m1
+ pandn m1, m7
+ psubusb m3, m0
+ paddusb m3, m1 ; p0+f2
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m0, m6
+ psubb m1, m6
+ psrlq m6, 3 ; +f1
+ psrlq m1, 3 ; -f1
+ pand m1, m0
+ pandn m0, m6
+ psubusb m4, m0
+ paddusb m4, m1 ; q0-f1
+
+ ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
+%if cpuflag(ssse3)
+ mova m7, [pb_1]
+%else
+ mova m7, [pw_63]
+%endif
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m1, m_limres
+%endif
+ pxor m0, m0
+ mova m6, m1
+ pcmpgtb m0, m1 ; which are negative
+%if cpuflag(ssse3)
+ punpcklbw m6, m7 ; interleave with "1" for rounding
+ punpckhbw m1, m7
+%else
+ punpcklbw m6, m0 ; signed byte->word
+ punpckhbw m1, m0
+%endif
+ mova m_limsign, m0
+%if cpuflag(ssse3)
+ mova m7, [pb_27_63]
+%ifndef m8
+ mova m_limres, m1
+%endif
+%ifdef m10
+ SWAP 0, 10 ; don't lose lim_sign copy
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%ifdef m10
+ SWAP 0, 10
+%else
+ mova m0, m_limsign
+%endif
+%else
+ mova m_maskres, m6 ; backup for later in filter
+ mova m_limres, m1
+ pmullw m6, [pw_27]
+ pmullw m1, [pw_27]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a0
+ pxor m1, m1
+ psubb m1, m6
+ pand m1, m0 ; -a0
+ pandn m0, m6 ; +a0
+%if cpuflag(ssse3)
+ mova m6, [pb_18_63] ; pipelining
+%endif
+ psubusb m3, m1
+ paddusb m4, m1
+ paddusb m3, m0 ; p0+a0
+ psubusb m4, m0 ; q0-a0
+
+%if cpuflag(ssse3)
+ SWAP 6, 7
+%ifdef m10
+ SWAP 1, 10
+%else
+ mova m1, m_limres
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%ifdef m10
+ SWAP 0, 10
+%endif
+ mova m0, m_limsign
+%else
+ mova m6, m_maskres
+ mova m1, m_limres
+ pmullw m6, [pw_18]
+ pmullw m1, [pw_18]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+ mova m0, m_limsign
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a1
+ pxor m1, m1
+ psubb m1, m6
+ pand m1, m0 ; -a1
+ pandn m0, m6 ; +a1
+%if cpuflag(ssse3)
+ mova m6, [pb_9_63]
+%endif
+ psubusb m2, m1
+ paddusb m5, m1
+ paddusb m2, m0 ; p1+a1
+ psubusb m5, m0 ; q1-a1
+
+%if cpuflag(ssse3)
+ SWAP 6, 7
+%ifdef m10
+ SWAP 1, 10
+%else
+ mova m1, m_limres
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%else
+%ifdef m8
+ SWAP 6, 12
+ SWAP 1, 8
+%else
+ mova m6, m_maskres
+ mova m1, m_limres
+%endif
+ pmullw m6, [pw_9]
+ pmullw m1, [pw_9]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+%ifdef m9
+ SWAP 7, 9
+%else
+ mova m7, m_limsign
+%endif
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a1
+ pxor m0, m0
+ psubb m0, m6
+ pand m0, m7 ; -a1
+ pandn m7, m6 ; +a1
+%ifdef m8
+ SWAP 1, 13
+ SWAP 6, 14
+%else
+ mova m1, m_p2backup
+ mova m6, m_q2backup
+%endif
+ psubusb m1, m0
+ paddusb m6, m0
+ paddusb m1, m7 ; p1+a1
+ psubusb m6, m7 ; q1-a1
+
+ ; store
+%ifidn %1, v
+ movrow [dst2q+mstrideq*4], m1
+ movrow [dst1q+mstrideq*2], m2
+ movrow [dst1q+mstrideq ], m3
+ movrow [dst1q], m4
+ movrow [dst2q], m5
+ movrow [dst2q+ strideq ], m6
+%if mmsize == 16 && %2 == 8
+ add dst8q, mstrideq
+ movhps [dst8q+mstrideq*2], m1
+ movhps [dst8q+mstrideq ], m2
+ movhps [dst8q], m3
+ add dst8q, strideq
+ movhps [dst8q], m4
+ movhps [dst8q+ strideq ], m5
+ movhps [dst8q+ strideq*2], m6
+%endif
+%else ; h
+ inc dst1q
+ inc dst2q
+
+ ; 4x8/16 transpose
+ TRANSPOSE4x4B 1, 2, 3, 4, 0
+ SBUTTERFLY bw, 5, 6, 0
+
+%if mmsize == 8 ; mmx/mmxext (h)
+ WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
+ add dst1q, 4
+ WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
+%else ; sse2 (h)
+ lea dst8q, [dst8q+mstrideq+1]
+ WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
+ lea dst1q, [dst2q+mstrideq+4]
+ lea dst8q, [dst8q+mstrideq+4]
+%if cpuflag(sse4)
+ add dst2q, 4
+%endif
+ WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
+%if cpuflag(sse4)
+ lea dst2q, [dst8q+ strideq ]
+%endif
+ WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
+%endif
+%endif
+
+%if mmsize == 8
+%if %2 == 8 ; chroma
+%ifidn %1, h
+ sub dst1q, 5
+%endif
+ cmp dst1q, dst8q
+ mov dst1q, dst8q
+ jnz .next8px
+%else
+%ifidn %1, h
+ lea dst1q, [dst1q+ strideq*8-5]
+%else ; v
+ add dst1q, 8
+%endif
+ dec cntrq
+ jg .next8px
+%endif
+ REP_RET
+%else ; mmsize == 16
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_MMX mmxext
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+%endif
+
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER h, 8
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init.c b/libs/ffvpx/libavcodec/x86/vp9dsp_init.c
new file mode 100644
index 000000000..837cce850
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init.c
@@ -0,0 +1,416 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_X86ASM
+
+decl_fpel_func(put, 4, , mmx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(avg, 4, _8, mmxext);
+decl_fpel_func(avg, 8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t, 8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
+#if ARCH_X86_64
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#endif
+
+mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
+#if ARCH_X86_32
+mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
+#endif
+mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
+mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
+mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
+mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
+#endif
+
+extern const int8_t ff_filters_ssse3[3][15][4][32];
+extern const int16_t ff_filters_sse2[3][15][8][8];
+
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
+#endif
+
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
+#endif
+
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct, idct, size, opt); \
+itxfm_func(iadst, idct, size, opt); \
+itxfm_func(idct, iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct, idct, 4, mmxext);
+itxfm_func(idct, iadst, 4, sse2);
+itxfm_func(iadst, idct, 4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+itxfm_funcs(16, avx2);
+itxfm_func(idct, idct, 32, avx2);
+
+#undef itxfm_func
+#undef itxfm_funcs
+
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+lpf_funcs(4, 8, mmxext);
+lpf_funcs(8, 8, mmxext);
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+lpf_funcs(44, 16, sse2);
+lpf_funcs(44, 16, ssse3);
+lpf_funcs(44, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
+lpf_funcs(88, 16, sse2);
+lpf_funcs(88, 16, ssse3);
+lpf_funcs(88, 16, avx);
+
+#undef lpf_funcs
+
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *l, const uint8_t *a)
+
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
+
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
+
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
+
+ipred_func(4, h, sse2);
+
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
+
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
+
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
+
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
+{
+#if HAVE_X86ASM
+ int cpu_flags;
+
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
+ return;
+ }
+
+ cpu_flags = av_get_cpu_flags();
+
+#define init_lpf(opt) do { \
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
+ dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+ dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+ dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+ dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+ dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+ dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+} while (0)
+
+#define init_ipred(sz, opt, t, e) \
+ dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+ init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+ init_ipred(sz, opt, hd, HOR_DOWN); \
+ init_ipred(sz, opt, vl, VERT_LEFT); \
+ init_ipred(sz, opt, hu, HOR_UP); \
+ init_ipred(sz, opt, tm, TM_VP8); \
+ init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+ init_dir_tm_ipred(sz, opt); \
+ init_ipred(sz, opt, h, HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+ init_ipred(sz, opt, dc, DC); \
+ init_ipred(sz, opt, dc_left, LEFT_DC); \
+ init_ipred(sz, opt, dc_top, TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+ init_dc_ipred(sz, opt); \
+ init_dir_tm_h_ipred(sz, opt); \
+} while (0)
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ init_fpel_func(4, 0, 4, put, , mmx);
+ init_fpel_func(3, 0, 8, put, , mmx);
+ if (!bitexact) {
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+ }
+ init_ipred(8, mmx, v, VERT);
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
+ init_subpel2(4, 0, 4, put, 8, mmxext);
+ init_subpel2(4, 1, 4, avg, 8, mmxext);
+ init_fpel_func(4, 1, 4, avg, _8, mmxext);
+ init_fpel_func(3, 1, 8, avg, _8, mmxext);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+ init_dc_ipred(4, mmxext);
+ init_dc_ipred(8, mmxext);
+ init_dir_tm_ipred(4, mmxext);
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ init_fpel_func(2, 0, 16, put, , sse);
+ init_fpel_func(1, 0, 32, put, , sse);
+ init_fpel_func(0, 0, 64, put, , sse);
+ init_ipred(16, sse, v, VERT);
+ init_ipred(32, sse, v, VERT);
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_subpel3_8to64(0, put, 8, sse2);
+ init_subpel3_8to64(1, avg, 8, sse2);
+ init_fpel_func(2, 1, 16, avg, _8, sse2);
+ init_fpel_func(1, 1, 32, avg, _8, sse2);
+ init_fpel_func(0, 1, 64, avg, _8, sse2);
+ init_lpf(sse2);
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+ init_dc_ipred(16, sse2);
+ init_dc_ipred(32, sse2);
+ init_dir_tm_h_ipred(8, sse2);
+ init_dir_tm_h_ipred(16, sse2);
+ init_dir_tm_h_ipred(32, sse2);
+ init_ipred(4, sse2, h, HOR);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_subpel3(0, put, 8, ssse3);
+ init_subpel3(1, avg, 8, ssse3);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
+ init_lpf(ssse3);
+ init_all_ipred(4, ssse3);
+ init_all_ipred(8, ssse3);
+ init_all_ipred(16, ssse3);
+ init_all_ipred(32, ssse3);
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+ init_lpf(avx);
+ init_dir_tm_h_ipred(8, avx);
+ init_dir_tm_h_ipred(16, avx);
+ init_dir_tm_h_ipred(32, avx);
+ }
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(1, 0, 32, put, , avx);
+ init_fpel_func(0, 0, 64, put, , avx);
+ init_ipred(32, avx, v, VERT);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(1, 1, 32, avg, _8, avx2);
+ init_fpel_func(0, 1, 64, avg, _8, avx2);
+ if (ARCH_X86_64) {
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
+ init_subpel3_32_64(0, put, 8, avx2);
+ init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+ }
+ init_dc_ipred(32, avx2);
+ init_ipred(32, avx2, h, HOR);
+ init_ipred(32, avx2, tm, TM_VP8);
+ }
+
+#undef init_fpel
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+
+#endif /* HAVE_X86ASM */
+}
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init.h b/libs/ffvpx/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 000000000..e410cab3a
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+// hack to force-expand BPC
+#define cat(a, bpp, b) a##bpp##b
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ const uint8_t *l, \
+ const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type, 4, bpp, opt4); \
+decl_ipred_fn(type, 8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
+void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int16_t *block, \
+ int eob)
+
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct, idct, size, bpp, opt); \
+decl_itxfm_func(iadst, idct, size, bpp, opt); \
+decl_itxfm_func(idct, iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const type (*filter)[f_sz]) \
+{ \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
+ src_stride, h, filter); \
+ ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+ src_stride, h, filter); \
+}
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+ h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+ ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+ src_stride, h + 7, \
+ ff_filters_##f_opt[f][mx - 1]); \
+ ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+ 64 * bytes, h, \
+ ff_filters_##f_opt[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+ init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \
+ init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+ init_subpel2(0, idx, 64, type, bpp, opt); \
+ init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+ init_subpel3_32_64(idx, type, bpp, opt); \
+ init_subpel2(2, idx, 16, type, bpp, opt); \
+ init_subpel2(3, idx, 8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+ init_subpel3_8to64(idx, type, bpp, opt); \
+ init_subpel2(4, idx, 4, type, bpp, opt)
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 8, bpp, opt); \
+ init_ipred_func(type, enum, 16, bpp, opt); \
+ init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 4, bpp, opt); \
+ init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c b/libs/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 000000000..2694c06cb
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c b/libs/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 000000000..5da3bc184
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c b/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 000000000..60d10a12a
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,149 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_X86ASM
+
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(avg, 8, _16, mmxext);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(put, 128, , sse);
+decl_fpel_func(avg, 16, _16, sse2);
+decl_fpel_func(avg, 32, _16, sse2);
+decl_fpel_func(avg, 64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(put, 128, , avx);
+decl_fpel_func(avg, 32, _16, avx2);
+decl_fpel_func(avg, 64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v, 16, mmx, sse);
+decl_ipred_fns(h, 16, mmxext, sse2);
+decl_ipred_fns(dc, 16, mmxext, sse2);
+decl_ipred_fns(dc_top, 16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl, 16, 16, avx2);
+decl_ipred_fn(dl, 32, 16, avx2);
+decl_ipred_fn(dr, 16, 16, avx2);
+decl_ipred_fn(dr, 32, 16, avx2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2, sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx, avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ init_fpel_func(4, 0, 8, put, , mmx);
+ init_ipred_func(v, VERT, 4, 16, mmx);
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_fpel_func(4, 1, 8, avg, _16, mmxext);
+ init_ipred_func(h, HOR, 4, 16, mmxext);
+ init_ipred_func(dc, DC, 4, 16, mmxext);
+ init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
+ init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+ }
+
+ if (EXTERNAL_SSE(cpu_flags)) {
+ init_fpel_func(3, 0, 16, put, , sse);
+ init_fpel_func(2, 0, 32, put, , sse);
+ init_fpel_func(1, 0, 64, put, , sse);
+ init_fpel_func(0, 0, 128, put, , sse);
+ init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_fpel_func(3, 1, 16, avg, _16, sse2);
+ init_fpel_func(2, 1, 32, avg, _16, sse2);
+ init_fpel_func(1, 1, 64, avg, _16, sse2);
+ init_fpel_func(0, 1, 128, avg, _16, sse2);
+ init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+ init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+ init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+ init_ipred_funcs(hu, HOR_UP, 16, sse2);
+ init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+ init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+ init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+ init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+ }
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ init_fpel_func(2, 0, 32, put, , avx);
+ init_fpel_func(1, 0, 64, put, , avx);
+ init_fpel_func(0, 0, 128, put, , avx);
+ init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+ init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+ init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+ init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+ init_ipred_funcs(hu, HOR_UP, 16, avx);
+ init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ init_fpel_func(2, 1, 32, avg, _16, avx2);
+ init_fpel_func(1, 1, 64, avg, _16, avx2);
+ init_fpel_func(0, 1, 128, avg, _16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
+ init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
+#if ARCH_X86_64
+ init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
+#endif
+ }
+
+#endif /* HAVE_X86ASM */
+}
diff --git a/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 000000000..b56afc7f5
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,240 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_X86ASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16];
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC)
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
+#if HAVE_AVX2_EXTERNAL
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) \
+decl_lpf_funcs(dir, 4, BPC); \
+decl_lpf_funcs(dir, 8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \
+ ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
+
+#define lpf_16_wrappers(bpp, opt) \
+lpf_16_wrapper(h, 8 * stride, bpp, opt) \
+lpf_16_wrapper(v, 16, bpp, opt)
+
+lpf_16_wrappers(BPC, sse2)
+lpf_16_wrappers(BPC, ssse3)
+lpf_16_wrappers(BPC, avx)
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \
+ E & 0xff, I & 0xff, H & 0xff); \
+ ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+ E >> 8, I >> 8, H >> 8); \
+}
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) \
+lpf_mix2_wrappers(4, 4, bpp, opt) \
+lpf_mix2_wrappers(4, 8, bpp, opt) \
+lpf_mix2_wrappers(8, 4, bpp, opt) \
+lpf_mix2_wrappers(8, 8, bpp, opt) \
+
+lpf_mix2_wrappers_set(BPC, sse2)
+lpf_mix2_wrappers_set(BPC, ssse3)
+lpf_mix2_wrappers_set(BPC, avx)
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
+decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_funcs(4, BPC, ssse3);
+#else
+decl_itxfm_func(idct, idct, 4, BPC, sse2);
+#endif
+decl_itxfm_func(idct, iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct, 4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_funcs(16, BPC, sse2);
+decl_itxfm_func(idct, idct, 32, BPC, sse2);
+#endif /* HAVE_X86ASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+ dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+ init_lpf_8_func(0, 0, h, 4, bpp, opt); \
+ init_lpf_8_func(0, 1, v, 4, bpp, opt); \
+ init_lpf_8_func(1, 0, h, 8, bpp, opt); \
+ init_lpf_8_func(1, 1, v, 8, bpp, opt); \
+ init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+ init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+ init_lpf_16_func(0, h, bpp, opt); \
+ init_lpf_16_func(1, v, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+ init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+ init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+ init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+ init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
+ dsp->itxfm_add[idxa][idxb] = \
+ cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
+#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
+ init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+ init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+ if (!bitexact) {
+ init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+ init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
+ }
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ init_subpel3(0, put, BPC, sse2);
+ init_subpel3(1, avg, BPC, sse2);
+ init_lpf_funcs(BPC, sse2);
+ init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2);
+ init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2);
+ init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+ }
+#else
+ init_itx_funcs(TX_4X4, 4, 12, sse2);
+#endif
+ init_itx_funcs(TX_8X8, 8, BPC, sse2);
+ init_itx_funcs(TX_16X16, 16, BPC, sse2);
+ init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_funcs(TX_4X4, 4, BPC, ssse3);
+ }
+#endif
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ init_lpf_funcs(BPC, avx);
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+ init_subpel3_32_64(0, put, BPC, avx2);
+ init_subpel3_32_64(1, avg, BPC, avx2);
+ init_subpel2(2, 0, 16, put, BPC, avx2);
+ init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+ }
+
+#endif /* HAVE_X86ASM */
+
+ ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libs/ffvpx/libavcodec/x86/vp9intrapred.asm b/libs/ffvpx/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 000000000..31f7d449f
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+ times 4 db 2
+ times 4 db 1
+ times 4 db 0
+pb_8x1_8x0: times 8 db 1
+ times 8 db 0
+pb_8x3_8x2: times 8 db 3
+ times 8 db 2
+pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
+ times 8 db -1
+pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
+ times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+ times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+ times 11 db 7
+pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+ times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+ times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+ db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+ db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+ db -1
+pb_0to2_5x3: db 0, 1, 2
+ times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+ times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+ times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_4to8_FUNCS 0
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [lq]
+ movq m1, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ mova m2, [aq]
+ mova m3, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m4, m4
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+ pshufb m0, m4
+%else
+ paddw m0, [pw_32]
+ psraw m0, 6
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_512]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [%2q]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_8192]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_2]
+ psraw m0, 2
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ psadbw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+%else
+ paddw m0, [pw_4]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000
+%endif
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top, a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_8]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ mova m1, [%2q+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+%else
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+%endif
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top, a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ pmulhrsw xm0, [pw_1024]
+ vpbroadcastb m0, xm0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top, a
+DC_1D_AVX2_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+ mova m0, [aq]
+ mova m1, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+; h
+
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_4x3_4x2_4x1_4x0]
+%else
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0123
+ punpcklwd m0, m0
+%endif
+ lea stride3q, [strideq*3]
+ movd [dstq+strideq*0], m0
+ psrldq m0, 4
+ movd [dstq+strideq*1], m0
+ psrldq m0, 4
+ movd [dstq+strideq*2], m0
+ psrldq m0, 4
+ movd [dstq+stride3q ], m0
+ RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m2, [pb_8x1_8x0]
+ mova m3, [pb_8x3_8x2]
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 1
+.loop:
+ movd m0, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m1, m0, m3
+ pshufb m0, m2
+%else
+ punpcklbw m0, m0
+ punpcklwd m0, m0
+ pshufd m1, m0, q2233
+ pshufd m0, m0, q0011
+%endif
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 3
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+%endif
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+%else
+ punpcklbw m3, m3
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+%endif
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+ pshufb m2, m3, m5
+ pshufb m3, m4
+%else
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd xm3, [lq+cntq*4]
+ vinserti128 m3, m3, xm3, 1
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m2, m3, m5
+ pshufb m3, m4
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; tm
+
+%macro TM_MMX_FUNCS 0
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+ pxor m1, m1
+ movd m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 1
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ pshufw m4, m2, q1111
+ pshufw m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m4
+ packuswb m2, m2
+ movd [dstq+strideq*0], m4
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
+
+%macro TM_XMM_FUNCS 0
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+ pxor m1, m1
+ movh m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpcklbw m0, m1
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m3, [pw_m256]
+ mova m1, [pw_m255]
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m0, m2
+ mov cntq, 3
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m4, m2, m1
+ pshufb m2, m3
+%else
+ punpcklbw m2, m1
+ punpcklwd m2, m2
+ pshufd m4, m2, q1111
+ pshufd m2, m2, q0000
+%endif
+ paddw m4, m0
+ paddw m2, m0
+ packuswb m4, m2
+ movh [dstq+strideq*0], m4
+ movhps [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ mova m0, [aq]
+ pinsrw m2, [aq-1], 0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+ mova m4, [pw_m256]
+ mova m3, [pw_m255]
+ pshufb m2, m4
+%else
+ punpcklbw m2, m3
+ punpcklwd m2, m2
+ pshufd m2, m2, q0000
+%endif
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 7
+.loop:
+ pinsrw m7, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m5, m7, m3
+ pshufb m7, m4
+%else
+ punpcklbw m7, m3
+ punpcklwd m7, m7
+ pshufd m5, m7, q1111
+ pshufd m7, m7, q0000
+%endif
+ paddw m2, m5, m0
+ paddw m5, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m5
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+%if ARCH_X86_64
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
+ pxor m5, m5
+ pinsrw m4, [aq-1], 0
+ mova m0, [aq]
+ mova m2, [aq+16]
+ DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_m256]
+ mova m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+ pshufb m4, pw_m256_reg
+%else
+ punpcklbw m4, m5
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+%endif
+ punpckhbw m1, m0, m5
+ punpckhbw m3, m2, m5
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ psubw m1, m4
+ psubw m0, m4
+ psubw m3, m4
+ psubw m2, m4
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%else
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+ mova [rsp+3*16], m3
+%endif
+ mov cntq, 15
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+ pshufb m7, m3, pw_m255_reg
+ pshufb m3, pw_m256_reg
+%else
+ pxor m7, m7
+ punpcklbw m3, m7
+ punpcklwd m3, m3
+ pshufd m7, m3, q1111
+ pshufd m3, m3, q0000
+%endif
+%if ARCH_X86_64
+ paddw m4, m7, m8
+ paddw m5, m7, m9
+ paddw m6, m7, m10
+ paddw m7, m11
+ paddw m0, m3, m8
+ paddw m1, m3, m9
+ paddw m2, m3, m10
+ paddw m3, m11
+%else
+ paddw m4, m7, [rsp+0*16]
+ paddw m5, m7, [rsp+1*16]
+ paddw m6, m7, [rsp+2*16]
+ paddw m7, [rsp+3*16]
+ paddw m0, m3, [rsp+0*16]
+ paddw m1, m3, [rsp+1*16]
+ paddw m2, m3, [rsp+2*16]
+ paddw m3, [rsp+3*16]
+%endif
+ packuswb m4, m5
+ packuswb m6, m7
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m6
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ pinsrw xm2, [aq-1], 0
+ vinserti128 m2, m2, xm2, 1
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m4, [pw_m256]
+ mova m5, [pw_m255]
+ pshufb m2, m4
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 15
+.loop:
+ pinsrw xm7, [lq+cntq*2], 0
+ vinserti128 m7, m7, xm7, 1
+ pshufb m3, m7, m5
+ pshufb m7, m4
+ paddw m2, m3, m0
+ paddw m3, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m3
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+ pxor m%4, m%1, m%3
+ pand m%4, [pb_1]
+ pavgb m%1, m%3
+ psubusb m%1, m%4
+ pavgb m%1, m%2
+%endmacro
+
+%macro DL_MMX_FUNCS 0
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, m1, [pb_0to5_2x7]
+ pshufb m2, m1, [pb_2to6_3x7]
+%else
+ punpckhbw m3, m1, m1 ; 44556677
+ pand m0, m1, [pb_6xm1_2x0] ; 012345__
+ pand m3, [pb_6x0_2xm1] ; ______77
+ psrlq m2, m1, 16 ; 234567__
+ por m0, m3 ; 01234577
+ por m2, m3 ; 23456777
+%endif
+ psrlq m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ pshufw m1, m0, q3321
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ psrlq m0, 8
+ psrlq m1, 8
+ add dstq, strideq
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
+
+%macro DL_XMM_FUNCS 0
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+ movq m0, [aq]
+ lea stride5q, [strideq*5]
+%if cpuflag(ssse3)
+ pshufb m1, m0, [pb_1to6_10x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7
+%endif
+ shufps m0, m1, q3310
+%if notcpuflag(ssse3)
+ psrldq m1, m0, 1
+ shufps m1, m0, q3210
+%endif
+ psrldq m2, m1, 1
+ LOWPASS 0, 1, 2, 3
+
+ pshufd m1, m0, q3321
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+ mova m0, [aq]
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m1, m0, m5
+ pshufb m2, m1, m5
+ pshufb m4, m0, [pb_15]
+%else
+ pand m5, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m5 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m5 ; 23456789ABCDEFFF
+ pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF
+%endif
+ LOWPASS 0, 1, 2, 3
+ DEFINE_ARGS dst, stride, cnt, stride9
+ lea stride9q, [strideq+strideq*8]
+ mov cntd, 4
+
+.loop:
+ movhlps m4, m0
+ mova [dstq+strideq*0], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+strideq*8], m4
+ movhlps m4, m0
+ mova [dstq+strideq*1], m0
+%if cpuflag(ssse3)
+ pshufb m0, m5
+%else
+ psrldq m0, 1
+ por m0, m5
+%endif
+ mova [dstq+stride9q ], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+ mova m0, [aq]
+ mova m1, [aq+16]
+ PALIGNR m2, m1, m0, 1, m4
+ PALIGNR m3, m1, m0, 2, m4
+ LOWPASS 0, 2, 3, 4
+%if cpuflag(ssse3)
+ mova m5, [pb_1toE_2xF]
+ pshufb m2, m1, m5
+ pshufb m3, m2, m5
+ pshufb m6, m1, [pb_15]
+ mova m7, m6
+%else
+ pand m5, m1, [pb_15x0_1xm1] ; _______________F
+ psrldq m2, m1, 1 ; 123456789ABCDEF_
+ por m2, m5 ; 123456789ABCDEFF
+ psrldq m3, m2, 1 ; 23456789ABCDEFF_
+ por m3, m5 ; 23456789ABCDEFFF
+ pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF
+ pshufd m6, m7, q3333
+%endif
+ LOWPASS 1, 2, 3, 4
+ lea dst16q, [dstq +strideq*8]
+ mov cntd, 8
+ lea dst16q, [dst16q+strideq*8]
+.loop:
+ movhlps m7, m1
+ mova [dstq +strideq*0+ 0], m0
+ mova [dstq +strideq*0+16], m1
+ movhps [dstq+strideq*8+ 0], m0
+ movq [dstq +strideq*8+ 8], m1
+ mova [dstq +strideq*8+16], m7
+ mova [dst16q+strideq*0+ 0], m1
+ mova [dst16q+strideq*0+16], m6
+ mova [dst16q+strideq*8+ 0], m7
+ mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 1
+ pshufb m1, m5
+%elif cpuflag(ssse3)
+ palignr m2, m1, m0, 1
+ pshufb m1, m5
+ mova m0, m2
+%else
+ mova m4, m1
+ psrldq m0, 1
+ pslldq m4, 15
+ psrldq m1, 1
+ por m0, m4
+ por m1, m5
+%endif
+ add dstq, strideq
+ add dst16q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
+
+; dr
+
+%macro DR_MMX_FUNCS 0
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ movd m1, [aq+3]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ PALIGNR m1, m0, 1, m3
+ psrlq m2, m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ movd [dstq+stride3q ], m0
+ psrlq m0, 8
+ movd [dstq+strideq*2], m0
+ psrlq m0, 8
+ movd [dstq+strideq*1], m0
+ psrlq m0, 8
+ movd [dstq+strideq*0], m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
+
+%macro DR_XMM_FUNCS 0
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+ movq m1, [lq]
+ movhps m1, [aq-1]
+ movd m2, [aq+7]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m3
+ LOWPASS 0, 1, 2, 3
+
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+ mova m1, [lq]
+ movu m2, [aq-1]
+ movd m4, [aq+15]
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq *3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+ PALIGNR m4, m2, 1, m5
+ PALIGNR m3, m2, m1, 15, m5
+ LOWPASS 3, 2, 4, 5
+ pslldq m0, m1, 1
+ PALIGNR m2, m1, 1, m4
+ LOWPASS 0, 1, 2, 4
+
+.loop:
+ mova [dstq+strideq*0 ], m3
+ movhps [dstq+strideq*8+0], m0
+ movq [dstq+strideq*8+8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ mova [dstq+strideq*1 ], m3
+ movhps [dstq+stride9q +0], m0
+ movq [dstq+stride9q +8], m3
+ PALIGNR m3, m0, 15, m1
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+ mova m1, [lq]
+ mova m2, [lq+16]
+ movu m3, [aq-1]
+ movu m4, [aq+15]
+ movd m5, [aq+31]
+ DEFINE_ARGS dst, stride, stride8, cnt
+ lea stride8q, [strideq*8]
+ PALIGNR m5, m4, 1, m7
+ PALIGNR m6, m4, m3, 15, m7
+ LOWPASS 5, 4, 6, 7
+ PALIGNR m4, m3, 1, m7
+ PALIGNR m6, m3, m2, 15, m7
+ LOWPASS 4, 3, 6, 7
+ PALIGNR m3, m2, 1, m7
+ PALIGNR m6, m2, m1, 15, m7
+ LOWPASS 3, 2, 6, 7
+ PALIGNR m2, m1, 1, m6
+ pslldq m0, m1, 1
+ LOWPASS 2, 1, 0, 6
+ mov cntd, 16
+
+ ; out=m2/m3/m4/m5
+.loop:
+ mova [dstq+stride8q*0+ 0], m4
+ mova [dstq+stride8q*0+16], m5
+ mova [dstq+stride8q*2+ 0], m3
+ mova [dstq+stride8q*2+16], m4
+ PALIGNR m5, m4, 15, m6
+ PALIGNR m4, m3, 15, m6
+ PALIGNR m3, m2, 15, m6
+ pslldq m2, 1
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 8
+ psrlq m2, 8
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ RET
+
+%macro VL_XMM_FUNCS 0
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+ movq m0, [aq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0
+ punpckhwd m1, m1
+ shufps m0, m1, q3310
+%endif
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+ pshufb m2, m1, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1] ; _______________F
+ psrldq m1, m0, 1 ; 123456789ABCDEF_
+ por m1, m4 ; 123456789ABCDEFF
+ psrldq m2, m1, 1 ; 23456789ABCDEFF_
+ por m2, m4 ; 23456789ABCDEFFF
+%endif
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+ pshufb m1, m4
+ pshufb m2, m4
+%else
+ psrldq m1, 1
+ psrldq m2, 1
+ por m1, m4
+ por m2, m4
+%endif
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+ mova m0, [aq]
+ mova m5, [aq+16]
+ DEFINE_ARGS dst, stride, dst16, cnt
+ PALIGNR m2, m5, m0, 1, m4
+ PALIGNR m3, m5, m0, 2, m4
+ lea dst16q, [dstq +strideq*8]
+ LOWPASS 3, 2, 0, 6
+ pavgb m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_1toE_2xF]
+ pshufb m0, m5, m4
+ pshufb m1, m0, m4
+%else
+ pand m4, m5, [pb_15x0_1xm1] ; _______________F
+ psrldq m0, m5, 1 ; 123456789ABCDEF_
+ por m0, m4 ; 123456789ABCDEFF
+ psrldq m1, m0, 1 ; 23456789ABCDEFF_
+ por m1, m4 ; 23456789ABCDEFFF
+%endif
+ lea dst16q, [dst16q+strideq*8]
+ LOWPASS 1, 0, 5, 6
+ pavgb m0, m5
+%if cpuflag(ssse3)
+ pshufb m5, [pb_15]
+%else
+ punpckhbw m5, m4, m4
+ pshufhw m5, m5, q3333
+ punpckhqdq m5, m5
+%endif
+ mov cntd, 8
+
+.loop:
+%macro %%write 3
+ mova [dstq+stride%1+ 0], %2
+ mova [dstq+stride%1+16], %3
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+ palignr %2, %3, %2, 1
+ pshufb %3, m4
+%elif cpuflag(ssse3)
+ palignr m6, %3, %2, 1
+ pshufb %3, m4
+ mova %2, m6
+%else
+ pslldq m6, %3, 15
+ psrldq %3, 1
+ psrldq %2, 1
+ por %3, m4
+ por %2, m6
+%endif
+%endmacro
+
+ %%write q*0, m2, m0
+ %%write q*1, m3, m1
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
+
+; vr
+
+%macro VR_MMX_FUNCS 0
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq-1]
+ punpckldq m2, [lq]
+ movd m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 5, m3
+ psrlq m2, m1, 8
+ psllq m3, m1, 8
+ LOWPASS 2, 1, 3, 4
+
+ ; ABCD <- for the following predictor:
+ ; EFGH
+ ; IABC | m0 contains ABCDxxxx
+ ; JEFG | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+ punpckldq m0, m2
+ pshufb m2, [pb_13456_3xm1]
+ movd [dstq+strideq*0], m0
+ pshufb m0, [pb_6012_4xm1]
+ movd [dstq+stride3q ], m2
+ psrlq m2, 8
+ movd [dstq+strideq*2], m0
+ movd [dstq+strideq*1], m2
+%else
+ psllq m1, m2, 40
+ psrlq m2, 24
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m2
+ PALIGNR m0, m1, 7, m3
+ psllq m1, 8
+ PALIGNR m2, m1, 7, m3
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m2
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-1]
+ movhps m2, [lq]
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ PALIGNR m1, m2, 9, m3
+ pslldq m2, m1, 1
+ pslldq m3, m1, 2
+ LOWPASS 1, 2, 3, 4
+
+ ; ABCDEFGH <- for the following predictor:
+ ; IJKLMNOP
+ ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
+ ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
+ ; SQABCDEF
+ ; TRIJKLMN
+ ; USQABCDE
+ ; VTRIJKLM
+
+%if cpuflag(ssse3)
+ punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+ pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+ pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+%else
+ psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx
+ pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx
+ packuswb m3, m2 ; xVTRxxxxxUSQxxxx
+ pslldq m3, 4 ; xxxxxVTRxxxxxUSQ
+ PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG
+ psrldq m1, 8
+ pslldq m3, 8
+ PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO
+%endif
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m1
+ pslldq m0, 1
+ pslldq m1, 1
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
+ mova m0, [aq]
+ movu m1, [aq-1]
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ PALIGNR m3, m1, m2, 15, m6
+ LOWPASS 3, 1, 0, 4
+ pavgb m0, m1
+ PALIGNR m1, m2, 1, m6
+ pslldq m4, m2, 1
+ LOWPASS 1, 2, 4, 5
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+%else
+ psrlw m5, m1, 8
+ pand m1, [pw_255]
+ packuswb m1, m5
+%endif
+ mov cntd, 4
+
+.loop:
+ movlhps m2, m1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m4, m0, m1, 15, m6
+ PALIGNR m5, m3, m2, 15, m6
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q ], m5
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m1, 14, m6
+ PALIGNR m3, m2, 14, m6
+ pslldq m1, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+ mova m0, [aq]
+ mova m2, [aq+16]
+ movu m1, [aq-1]
+ PALIGNR m3, m2, m0, 15, m6
+ PALIGNR m4, m2, m0, 14, m6
+ LOWPASS 4, 3, 2, 5
+ pavgb m3, m2
+ mova m2, [lq+16]
+ PALIGNR m5, m1, m2, 15, m6
+ LOWPASS 5, 1, 0, 6
+ pavgb m0, m1
+ mova m6, [lq]
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [dstq], m0
+%endif
+ PALIGNR m1, m2, 1, m0
+ PALIGNR m7, m2, m6, 15, m0
+ LOWPASS 1, 2, 7, 0
+ PALIGNR m2, m6, 1, m0
+ pslldq m7, m6, 1
+ LOWPASS 2, 6, 7, 0
+%if cpuflag(ssse3)
+ pshufb m1, [pb_02468ACE_13579BDF]
+ pshufb m2, [pb_02468ACE_13579BDF]
+%else
+ psrlw m0, m1, 8
+ psrlw m6, m2, 8
+ pand m1, [pw_255]
+ pand m2, [pw_255]
+ packuswb m1, m0
+ packuswb m2, m6
+%endif
+ DEFINE_ARGS dst, stride, dst16, cnt
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ SBUTTERFLY qdq, 2, 1, 6
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova m0, [dstq]
+%endif
+ mov cntd, 8
+
+.loop:
+ ; even lines (0, 2, 4, ...): m1 | m0, m3
+ ; odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+ mova [dstq+stride%1+ 0], %3
+ mova [dstq+stride%1+16], %4
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], %4
+ PALIGNR %4, %3, 15, m6
+ PALIGNR %3, %2, 15, m6
+ pslldq %2, 1
+%endmacro
+
+ %%write q*0, m1, m0, m3
+ %%write q*1, m2, m5, m4
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
+
+; hd
+
+INIT_MMX mmxext
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; DHIJ <- for the following predictor:
+ ; CGDH
+ ; BFCG | m1 contains ABCDxxxx
+ ; AEBF | m2 contains EFGHIJxx
+
+ punpcklbw m1, m2
+ punpckhdq m0, m1, m2
+
+ ; m1 contains AEBFCGDH
+ ; m0 contains CGDHIJxx
+
+ movd [dstq+stride3q ], m1
+ movd [dstq+strideq*1], m0
+ psrlq m1, 16
+ psrlq m0, 16
+ movd [dstq+strideq*2], m1
+ movd [dstq+strideq*0], m0
+ RET
+
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
+ movq m0, [lq]
+ movhps m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; HPQRSTUV <- for the following predictor
+ ; GOHPQRST
+ ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
+ ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
+ ; DLEMFNGO
+ ; CKDLEMFN
+ ; BJCKDLEM
+ ; AIBJCKDL
+
+ punpcklbw m1, m2
+ movhlps m2, m2
+
+ ; m1 contains AIBJCKDLEMFNGOHP
+ ; m2 contains QRSTUVxxxxxxxxxx
+
+ movhps [dstq +stride3q ], m1
+ movq [dst4q+stride3q ], m1
+ PALIGNR m3, m2, m1, 2, m4
+ movhps [dstq +strideq*2], m3
+ movq [dst4q+strideq*2], m3
+ PALIGNR m3, m2, m1, 4, m4
+ movhps [dstq +strideq*1], m3
+ movq [dst4q+strideq*1], m3
+ PALIGNR m2, m1, 6, m4
+ movhps [dstq +strideq*0], m2
+ movq [dst4q+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+ mova m0, [lq]
+ movu m3, [aq-1]
+ DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+ lea stride4q, [strideq*4]
+ lea dst4q, [dstq +stride4q]
+ lea dst8q, [dst4q+stride4q]
+ lea dst12q, [dst8q+stride4q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m1, m3, m0, 1, m6
+ PALIGNR m2, m3, m0, 2, m6
+ LOWPASS 2, 1, 0, 6
+ pavgb m1, m0
+ SBUTTERFLY bw, 1, 2, 6
+
+ ; I PROBABLY INVERTED L0 ad L16 here
+ ; m1, m2, m5
+.loop:
+ sub stride4q, strideq
+ movhps [dstq +stride4q +0], m2
+ movq [dstq +stride4q +8], m5
+ mova [dst4q+stride4q ], m2
+ movhps [dst8q+stride4q +0], m1
+ movq [dst8q+stride4q +8], m2
+ mova [dst12q+stride4q ], m1
+%if cpuflag(avx)
+ palignr m1, m2, m1, 2
+ palignr m2, m5, m2, 2
+%elif cpuflag(ssse3)
+ palignr m3, m2, m1, 2
+ palignr m0, m5, m2, 2
+ mova m1, m3
+ mova m2, m0
+%else
+ ; slightly modified version of PALIGNR
+ mova m6, m2
+ mova m4, m5
+ pslldq m6, 14
+ pslldq m4, 14
+ psrldq m1, 2
+ psrldq m2, 2
+ por m1, m6
+ por m2, m4
+%endif
+ psrldq m5, 2
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ movu m2, [aq-1]
+ movu m3, [aq+15]
+ DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+ lea stride8q, [strideq*8]
+ lea dst8q, [dstq +stride8q]
+ lea dst16q, [dst8q +stride8q]
+ lea dst24q, [dst16q+stride8q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m3, m2, 1, m6
+ LOWPASS 4, 3, 2, 6
+ PALIGNR m3, m2, m1, 2, m6
+ PALIGNR m2, m1, 1, m6
+ LOWPASS 3, 2, 1, 6
+ pavgb m2, m1
+ PALIGNR m6, m1, m0, 1, m7
+ PALIGNR m1, m0, 2, m7
+ LOWPASS 1, 6, 0, 7
+ pavgb m0, m6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 0, 1, 6
+
+ ; m0, m1, m2, m3, m4, m5
+.loop:
+ sub stride8q, strideq
+ mova [dstq +stride8q+ 0], m3
+ mova [dstq +stride8q+16], m4
+ mova [dst8q +stride8q+ 0], m2
+ mova [dst8q +stride8q+16], m3
+ mova [dst16q+stride8q+ 0], m1
+ mova [dst16q+stride8q+16], m2
+ mova [dst24q+stride8q+ 0], m0
+ mova [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+ palignr m0, m1, m0, 2
+ palignr m1, m2, m1, 2
+ palignr m2, m3, m2, 2
+ palignr m3, m4, m3, 2
+ palignr m4, m5, m4, 2
+ psrldq m5, 2
+%elif cpuflag(ssse3)
+ psrldq m6, m5, 2
+ palignr m5, m4, 2
+ palignr m4, m3, 2
+ palignr m3, m2, 2
+ palignr m2, m1, 2
+ palignr m1, m0, 2
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+%else
+ ; sort of a half-integrated version of PALIGNR
+ pslldq m7, m4, 14
+ pslldq m6, m5, 14
+ psrldq m4, 2
+ psrldq m5, 2
+ por m4, m6
+ pslldq m6, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ pslldq m7, m2, 14
+ psrldq m2, 2
+ por m2, m6
+ pslldq m6, m1, 14
+ psrldq m1, 2
+ por m1, m7
+ psrldq m0, 2
+ por m0, m6
+%endif
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
+
+%macro HU_MMX_FUNCS 0
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+ movd m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to2_5x3]
+%else
+ punpcklbw m1, m0, m0 ; 00112233
+ pshufw m1, m1, q3333 ; 33333333
+ punpckldq m0, m1 ; 01233333
+%endif
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ SBUTTERFLY bw, 1, 2, 0
+ PALIGNR m2, m1, 2, m0
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ punpckhdq m1, m1
+ punpckhdq m2, m2
+ movd [dstq+strideq*2], m1
+ movd [dstq+stride3q ], m2
+ RET
+%endmacro
+
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+ movq m0, [lq]
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to6_9x7]
+%else
+ punpcklbw m1, m0, m0 ; 0011223344556677
+ punpckhwd m1, m1 ; 4444555566667777
+ shufps m0, m1, q3310 ; 0123456777777777
+%endif
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ SBUTTERFLY bw, 1, 2, 0
+ movq [dstq +strideq*0], m1
+ movhps [dst4q+strideq*0], m1
+ PALIGNR m0, m2, m1, 2, m3
+ movq [dstq +strideq*1], m0
+ movhps [dst4q+strideq*1], m0
+ PALIGNR m0, m2, m1, 4, m3
+ movq [dstq +strideq*2], m0
+ movhps [dst4q+strideq*2], m0
+ PALIGNR m2, m1, 6, m3
+ movq [dstq +stride3q ], m2
+ movhps [dst4q+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2toE_3xF]
+ pshufb m1, m0, [pb_1toE_2xF]
+ pshufb m2, m0, m3
+%else
+ pand m3, m0, [pb_15x0_1xm1]
+ psrldq m1, m0, 1
+ por m1, m3
+ punpckhbw m3, m3
+ psrldq m2, m0, 2
+ por m2, m3
+%endif
+ LOWPASS 2, 1, 0, 4
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq*8+strideq]
+ mov cntd, 4
+ SBUTTERFLY bw, 1, 2, 0
+
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*8], m2
+ PALIGNR m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride9q ], m2
+ PALIGNR m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
+ pshufb m2, m3
+%else
+ psrldq m2, 2
+ por m2, m3
+%endif
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
+ mova m1, [lq]
+ mova m0, [lq+16]
+ PALIGNR m2, m0, m1, 1, m5
+ PALIGNR m3, m0, m1, 2, m5
+ LOWPASS 3, 2, 1, 5
+ pavgb m2, m1
+%if cpuflag(ssse3)
+ mova m4, [pb_2toE_3xF]
+ pshufb m5, m0, [pb_1toE_2xF]
+ pshufb m1, m0, m4
+%else
+ pand m4, m0, [pb_15x0_1xm1]
+ psrldq m5, m0, 1
+ por m5, m4
+ punpckhbw m4, m4
+ psrldq m1, m0, 2
+ por m1, m4
+%endif
+ LOWPASS 1, 5, 0, 6
+ pavgb m0, m5
+ DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+ mov cntd, 8
+ xor stride0q, stride0q
+ lea dst8q, [dstq +strideq*8]
+ lea dst16q, [dst8q +strideq*8]
+ lea dst24q, [dst16q+strideq*8]
+ SBUTTERFLY bw, 0, 1, 5
+ SBUTTERFLY bw, 2, 3, 5
+%if cpuflag(ssse3)
+ pshufb m6, m1, [pb_15]
+%else
+ pshufhw m6, m4, q3333
+ punpckhqdq m6, m6
+%endif
+
+.loop:
+ mova [dstq +stride0q+ 0], m2
+ mova [dstq +stride0q+16], m3
+ mova [dst8q +stride0q+ 0], m3
+ mova [dst8q +stride0q+16], m0
+ mova [dst16q+stride0q+ 0], m0
+ mova [dst16q+stride0q+16], m1
+ mova [dst24q+stride0q+ 0], m1
+ mova [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+ palignr m2, m3, m2, 2
+ palignr m3, m0, m3, 2
+ palignr m0, m1, m0, 2
+ pshufb m1, m4
+%elif cpuflag(ssse3)
+ pshufb m5, m1, m4
+ palignr m1, m0, 2
+ palignr m0, m3, 2
+ palignr m3, m2, 2
+ mova m2, m3
+ mova m3, m0
+ mova m0, m1
+ mova m1, m5
+%else
+ ; half-integrated version of PALIGNR
+ pslldq m5, m1, 14
+ pslldq m7, m0, 14
+ psrldq m1, 2
+ psrldq m0, 2
+ por m1, m4
+ por m0, m5
+ pslldq m5, m3, 14
+ psrldq m3, 2
+ por m3, m7
+ psrldq m2, 2
+ por m2, m5
+%endif
+ add stride0q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
+
+; FIXME 127, 128, 129 ?
diff --git a/libs/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/libs/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 000000000..32b698243
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2392 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535;
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ mova m3, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m2
+ mova [dstq+strideq*1+48], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+ mova m3, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ punpckhwd m3, m2, m2
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m3, q1111
+ pshufd m1, m3, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ punpcklwd m2, m2
+ pshufd m0, m2, q3333
+ pshufd m1, m2, q2222
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q0000
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 3
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
+ mov cntd, 7
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m1
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+strideq*2+32], m2
+ mova [dstq+strideq*2+48], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ mova [dstq+stride3q +32], m3
+ mova [dstq+stride3q +48], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [lq+mmsize]
+ paddw m0, [aq]
+ paddw m0, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq+mmsize*0]
+ paddw m0, [lq+mmsize*1]
+ paddw m0, [lq+mmsize*2]
+ paddw m0, [lq+mmsize*3]
+ paddw m0, [aq+mmsize*0]
+ paddw m0, [aq+mmsize*1]
+ paddw m0, [aq+mmsize*2]
+ paddw m0, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_32]
+ paddd m0, m1
+ psrad m0, 6
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, [pd_2]
+ paddd m0, m1
+ psrad m0, 2
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_4]
+ paddd m0, m1
+ psrad m0, 3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ paddw m0, [%2+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_8]
+ paddd m0, m1
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2+mmsize*0]
+ paddw m0, [%2+mmsize*1]
+ paddw m0, [%2+mmsize*2]
+ paddw m0, [%2+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, [pd_16]
+ paddd m0, m1
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DC_1D_FNS top, aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_1023]
+.body:
+ mova m4, [aq]
+ mova m3, [lq]
+ movd m0, [aq-4]
+ pshufw m0, m0, q1111
+ psubw m4, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ paddw m0, m4
+ paddw m1, m4
+ paddw m2, m4
+ paddw m3, m4
+ pxor m4, m4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m3, m5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m5, [aq]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 1
+.loop:
+ movh m3, [lq+cntq*8]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pmaxsw m2, m6
+ pmaxsw m3, m6
+ pminsw m0, m4
+ pminsw m1, m4
+ pminsw m2, m4
+ pminsw m3, m4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
+ mova m4, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m4, [aq]
+ mova m5, [aq+mmsize]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 7
+.loop:
+ movd m3, [lq+cntq*4]
+ punpcklwd m3, m3
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ paddw m0, m2, m4
+ paddw m2, m5
+ paddw m1, m3, m4
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m2, m6
+ pmaxsw m1, m6
+ pmaxsw m3, m6
+ pminsw m0, m7
+ pminsw m2, m7
+ pminsw m1, m7
+ pminsw m3, m7
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_1023]
+.body:
+ pxor m1, m1
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+ mova [rsp+ 0], m0
+ mova [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+ mova m4, [aq+mmsize*0]
+ mova m5, [aq+mmsize*1]
+ mova m6, [aq+mmsize*2]
+ mova m7, [aq+mmsize*3]
+ movd m0, [aq-4]
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ psubw m6, m0
+ psubw m7, m0
+ DEFINE_ARGS dst, stride, l, cnt
+ mov cntd, 31
+.loop:
+ pinsrw m3, [lq+cntq*2], 0
+ punpcklwd m3, m3
+ pshufd m3, m3, q0000
+ paddw m0, m3, m4
+ paddw m1, m3, m5
+ paddw m2, m3, m6
+ paddw m3, m7
+ pmaxsw m0, reg_min
+ pmaxsw m1, reg_min
+ pmaxsw m2, reg_min
+ pmaxsw m3, reg_min
+ pminsw m0, reg_max
+ pminsw m1, reg_max
+ pminsw m2, reg_max
+ pminsw m3, reg_max
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ add dstq, strideq
+ dec cntd
+ jge .loop
+ RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra predicion functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
+
+; left=(left+2*center+right+2)>>2
+%macro LOWPASS 3 ; left [dst], center, right
+ paddw m%1, m%3
+ psraw m%1, 1
+ pavgw m%1, m%2
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
+%else
+ psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+ pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
+ pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
+%else
+ psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
+ psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
+ pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
+ pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m1, [aq] ; abcdefgh
+ pshufhw m0, m1, q3310 ; abcdefhh
+ SHIFT_RIGHT m1, m1 ; bcdefghh
+ psrldq m2, m1, 2 ; cdefghh.
+ LOWPASS 0, 1, 2 ; BCDEFGh.
+ pshufd m1, m0, q3321 ; DEFGh...
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ add dstq, strideq
+ psrldq m0, 2 ; CDEFGh..
+ psrldq m1, 2 ; EFGh....
+ movh [dstq+strideq*0], m0
+ movh [dstq+strideq*2], m1
+ RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
+ LOWPASS 0, 1, 2 ; BCDEFGHh
+ shufps m1, m0, m2, q3332 ; FGHhhhhh
+ shufps m3, m0, m1, q2121 ; DEFGHhhh
+ DEFINE_ARGS dst, stride, stride5
+ lea stride5q, [strideq*5]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*4], m1
+ SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
+ pshuflw m1, m1, q3321 ; GHhhhhhh
+ pshufd m2, m0, q3321 ; EFGHhhhh
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ pshuflw m1, m1, q3321 ; Hhhhhhhh
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*4], m1
+ pshuflw m1, m1, q3321 ; hhhhhhhh
+ mova [dstq+strideq*1], m2
+ mova [dstq+stride5q ], m1
+ RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+ mova m3, [aq+mmsize] ; ijklmnop
+ PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
+ PALIGNR m2, m3, m0, 4, m4 ; cdefghij
+ LOWPASS 0, 1, 2 ; BCDEFGHI
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
+ LOWPASS 1, 2, 3 ; JKLMNOPp
+ pshufd m2, m2, q3333 ; pppppppp
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*8+ 0], m1
+ mova [dstq+strideq*8+16], m2
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+%else
+ PALIGNR m3, m1, m0, 2, m4
+ mova m0, m3
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0] ; abcdefgh
+ mova m1, [aq+mmsize*1] ; ijklmnop
+ mova m2, [aq+mmsize*2] ; qrstuvwx
+ mova m3, [aq+mmsize*3] ; yz012345
+ PALIGNR m4, m1, m0, 2, m6
+ PALIGNR m5, m1, m0, 4, m6
+ LOWPASS 0, 4, 5 ; BCDEFGHI
+ PALIGNR m4, m2, m1, 2, m6
+ PALIGNR m5, m2, m1, 4, m6
+ LOWPASS 1, 4, 5 ; JKLMNOPQ
+ PALIGNR m4, m3, m2, 2, m6
+ PALIGNR m5, m3, m2, 4, m6
+ LOWPASS 2, 4, 5 ; RSTUVWXY
+%if cpuflag(ssse3)
+ mova m6, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m4, m5, m3, m6
+ LOWPASS 3, 4, 5 ; Z0123455
+ pshufd m4, m4, q3333 ; 55555555
+ DEFINE_ARGS dst, stride, stride8, stride24, cnt
+ mov cntd, 8
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+
+.loop:
+ mova [dstq+stride8q*0+ 0], m0
+ mova [dstq+stride8q*0+16], m1
+ mova [dstq+stride8q*0+32], m2
+ mova [dstq+stride8q*0+48], m3
+ mova [dstq+stride8q*1+ 0], m1
+ mova [dstq+stride8q*1+16], m2
+ mova [dstq+stride8q*1+32], m3
+ mova [dstq+stride8q*1+48], m4
+ mova [dstq+stride8q*2+ 0], m2
+ mova [dstq+stride8q*2+16], m3
+ mova [dstq+stride8q*2+32], m4
+ mova [dstq+stride8q*2+48], m4
+ mova [dstq+stride24q + 0], m3
+ mova [dstq+stride24q +16], m4
+ mova [dstq+stride24q +32], m4
+ mova [dstq+stride24q +48], m4
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 2
+ vpalignr m1, m2, m1, 2
+ vpalignr m2, m3, m2, 2
+%else
+ PALIGNR m5, m1, m0, 2, m6
+ mova m0, m5
+ PALIGNR m5, m2, m1, 2, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 2, m6
+ mova m2, m5
+%endif
+ SHIFT_RIGHT m3, m3, m6
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
+ mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
+ vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
+ vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
+ vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
+ vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
+ LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
+ vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
+ vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
+ vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
+ LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
+ vperm2i128 m2, m1, m4, q0201 ; Z......555555555
+ vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq*0 + 0], m0
+ mova [dstq+strideq*0 +32], m1
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m2, m1, 2
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m2, m1, 4
+ mova [dstq+strideq*2 + 0], m3
+ mova [dstq+strideq*2 +32], m4
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m2, m1, 6
+ mova [dstq+stride3q*1+ 0], m3
+ mova [dstq+stride3q*1+32], m4
+ lea dstq, [dstq+strideq*4]
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m2, m1, 8
+ mova [dstq+strideq*0 + 0], m3
+ mova [dstq+strideq*0 +32], m4
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m2, m1, 10
+ mova [dstq+strideq*1 + 0], m3
+ mova [dstq+strideq*1 +32], m4
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m2, m1, 12
+ mova [dstq+strideq*2+ 0], m3
+ mova [dstq+strideq*2+32], m4
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m2, m1, 14
+ mova [dstq+stride3q+ 0], m3
+ mova [dstq+stride3q+ 32], m4
+ vpalignr m3, m5, m0, 16
+ vpalignr m4, m2, m1, 16
+ vperm2i128 m5, m3, m4, q0201
+ vperm2i128 m2, m4, m4, q0101
+ mova m0, m3
+ mova m1, m4
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endif
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movh m0, [lq] ; wxyz....
+ movhps m0, [aq-2] ; wxyz*abc
+ movd m1, [aq+6] ; d.......
+ PALIGNR m1, m0, 2, m2 ; xyz*abcd
+ psrldq m2, m1, 2 ; yz*abcd.
+ LOWPASS 0, 1, 2 ; XYZ#ABC.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m0
+ psrldq m0, 2 ; YZ#ABC..
+ movh [dstq+strideq*2], m0
+ psrldq m0, 2 ; Z#ABC...
+ movh [dstq+strideq*1], m0
+ psrldq m0, 2 ; #ABC....
+ movh [dstq+strideq*0], m0
+ RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq] ; stuvwxyz
+ movu m1, [aq-2] ; *abcdefg
+ mova m2, [aq] ; abcdefgh
+ psrldq m3, m2, 2 ; bcdefgh.
+ LOWPASS 3, 2, 1 ; ABCDEFG.
+ PALIGNR m1, m0, 2, m4 ; tuvwxyz*
+ PALIGNR m2, m1, 2, m4 ; uvwxyz*a
+ LOWPASS 2, 1, 0 ; TUVWXYZ#
+ DEFINE_ARGS dst, stride, dst4, stride3
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+
+ movhps [dstq +stride3q +0], m2
+ movh [dstq+ stride3q +8], m3
+ mova [dst4q+stride3q +0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*2+0], m1
+ movh [dstq+ strideq*2+8], m3
+ mova [dst4q+strideq*2+0], m1
+ PALIGNR m2, m3, m1, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*1+0], m2
+ movh [dstq+ strideq*1+8], m3
+ mova [dst4q+strideq*1+0], m2
+ PALIGNR m1, m3, m2, 2, m0
+ psrldq m3, 2
+ movhps [dstq +strideq*0+0], m1
+ movh [dstq+ strideq*0+8], m3
+ mova [dst4q+strideq*0+0], m1
+ RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+ mova m0, [lq] ; klmnopqr
+ mova m1, [lq+mmsize] ; stuvwxyz
+ movu m2, [aq-2] ; *abcdefg
+ movu m3, [aq+mmsize-2] ; hijklmno
+ mova m4, [aq] ; abcdefgh
+ mova m5, [aq+mmsize] ; ijklmnop
+ psrldq m6, m5, 2 ; jklmnop.
+ LOWPASS 6, 5, 3 ; IJKLMNO.
+ PALIGNR m5, m4, 2, m3 ; bcdefghi
+ LOWPASS 5, 4, 2 ; ABCDEFGH
+ PALIGNR m2, m1, 2, m3 ; tuvwxyz*
+ PALIGNR m4, m2, 2, m3 ; uvwxyz*a
+ LOWPASS 4, 2, 1 ; TUVWXYZ#
+ PALIGNR m1, m0, 2, m3 ; lmnopqrs
+ PALIGNR m2, m1, 2, m3 ; mnopqrst
+ LOWPASS 2, 1, 0 ; LMNOPQRS
+ DEFINE_ARGS dst, stride, dst8, cnt
+ lea dst8q, [dstq+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+ mova [dst8q+strideq*0+ 0], m4
+ mova [dst8q+strideq*0+16], m5
+ mova [dst8q+strideq*8+ 0], m2
+ mova [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+ vpalignr m2, m4, m2, 2
+ vpalignr m4, m5, m4, 2
+ vpalignr m5, m6, m5, 2
+%else
+ PALIGNR m0, m4, m2, 2, m1
+ mova m2, m0
+ PALIGNR m0, m5, m4, 2, m1
+ mova m4, m0
+ PALIGNR m0, m6, m5, 2, m1
+ mova m5, m0
+%endif
+ psrldq m6, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+ %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
+ mova m0, [aq+mmsize*3] ; a[24-31]
+ movu m1, [aq+mmsize*3-2] ; a[23-30]
+ psrldq m2, m0, 2 ; a[25-31].
+ LOWPASS 2, 0, 1 ; A[24-30].
+ mova m1, [aq+mmsize*2] ; a[16-23]
+ movu m3, [aq+mmsize*2-2] ; a[15-22]
+ PALIGNR m0, m1, 2, m4 ; a[17-24]
+ LOWPASS 0, 1, 3 ; A[16-23]
+ mova m3, [aq+mmsize*1] ; a[8-15]
+ movu m4, [aq+mmsize*1-2] ; a[7-14]
+ PALIGNR m1, m3, 2, m5 ; a[9-16]
+ LOWPASS 1, 3, 4 ; A[8-15]
+ mova m4, [aq+mmsize*0] ; a[0-7]
+ movu m5, [aq+mmsize*0-2] ; *a[0-6]
+ PALIGNR m3, m4, 2, m6 ; a[1-8]
+ LOWPASS 3, 4, 5 ; A[0-7]
+ SCRATCH 1, 8, rsp+0*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 10, rsp+2*mmsize
+%endif
+ mova m6, [lq+mmsize*3] ; l[24-31]
+ PALIGNR m5, m6, 2, m0 ; l[25-31]*
+ PALIGNR m4, m5, 2, m0 ; l[26-31]*a
+ LOWPASS 4, 5, 6 ; L[25-31]#
+ mova m7, [lq+mmsize*2] ; l[16-23]
+ PALIGNR m6, m7, 2, m0 ; l[17-24]
+ PALIGNR m5, m6, 2, m0 ; l[18-25]
+ LOWPASS 5, 6, 7 ; L[17-24]
+ mova m1, [lq+mmsize*1] ; l[8-15]
+ PALIGNR m7, m1, 2, m0 ; l[9-16]
+ PALIGNR m6, m7, 2, m0 ; l[10-17]
+ LOWPASS 6, 7, 1 ; L[9-16]
+ mova m3, [lq+mmsize*0] ; l[0-7]
+ PALIGNR m1, m3, 2, m0 ; l[1-8]
+ PALIGNR m7, m1, 2, m0 ; l[2-9]
+ LOWPASS 7, 1, 3 ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%endif
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%else
+ UNSCRATCH 0, 10, rsp+2*mmsize
+%endif
+ DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 8
+
+.loop:
+ sub dst8q, strideq
+%if notcpuflag(avx)
+ UNSCRATCH 1, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 9, rsp+1*mmsize
+%endif
+%endif
+ mova [dst8q+stride8q*0+ 0], m4
+ mova [dst8q+stride8q*0+16], m3
+ mova [dst8q+stride8q*0+32], m1
+ mova [dst8q+stride8q*0+48], m0
+ mova [dst8q+stride8q*1+ 0], m5
+ mova [dst8q+stride8q*1+16], m4
+ mova [dst8q+stride8q*1+32], m3
+ mova [dst8q+stride8q*1+48], m1
+ mova [dst8q+stride8q*2+ 0], m6
+ mova [dst8q+stride8q*2+16], m5
+ mova [dst8q+stride8q*2+32], m4
+ mova [dst8q+stride8q*2+48], m3
+ mova [dst8q+stride24q + 0], m7
+ mova [dst8q+stride24q +16], m6
+ mova [dst8q+stride24q +32], m5
+ mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+ vpalignr m7, m6, m7, 2
+ vpalignr m6, m5, m6, 2
+ vpalignr m5, m4, m5, 2
+ vpalignr m4, m3, m4, 2
+ vpalignr m3, m1, m3, 2
+ vpalignr m1, m0, m1, 2
+ vpalignr m0, m2, m0, 2
+%else
+ SCRATCH 2, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 0, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m6, m7, 2, m0
+ mova m7, m2
+ PALIGNR m2, m5, m6, 2, m0
+ mova m6, m2
+ PALIGNR m2, m4, m5, 2, m0
+ mova m5, m2
+ PALIGNR m2, m3, m4, 2, m0
+ mova m4, m2
+ PALIGNR m2, m1, m3, 2, m0
+ mova m3, m2
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ SCRATCH 3, 9, rsp+1*mmsize
+%endif
+ PALIGNR m2, m0, m1, 2, m3
+ mova m1, m2
+ UNSCRATCH 2, 8, rsp+0*mmsize
+ SCRATCH 1, 8, rsp+0*mmsize
+ PALIGNR m1, m2, m0, 2, m3
+ mova m0, m1
+%endif
+ psrldq m2, 2
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+ mova m0, [lq] ; klmnopqrstuvwxyz
+ movu m1, [aq-2] ; *abcdefghijklmno
+ mova m2, [aq] ; abcdefghijklmnop
+ vperm2i128 m4, m2, m2, q2001 ; ijklmnop........
+ vpalignr m5, m4, m2, 2 ; bcdefghijklmnop.
+ vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg
+ LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO.
+ vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz*
+ vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ#
+ vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH
+ DEFINE_ARGS dst, stride, stride3, stride5, dst3
+ lea dst3q, [dstq+strideq*4]
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+
+ vpalignr m3, m5, m0, 2
+ vpalignr m4, m1, m5, 2
+ mova [dst3q+stride5q*2], m3 ; 14
+ mova [ dstq+stride3q*2], m4 ; 6
+ vpalignr m3, m5, m0, 4
+ vpalignr m4, m1, m5, 4
+ sub dst3q, strideq
+ mova [dst3q+stride5q*2], m3 ; 13
+ mova [dst3q+strideq*2 ], m4 ; 5
+ mova [dst3q+stride3q*4], m0 ; 15
+ vpalignr m3, m5, m0, 6
+ vpalignr m4, m1, m5, 6
+ mova [dstq+stride3q*4], m3 ; 12
+ mova [dst3q+strideq*1], m4 ; 4
+ vpalignr m3, m5, m0, 8
+ vpalignr m4, m1, m5, 8
+ mova [dst3q+strideq*8], m3 ; 11
+ mova [dst3q+strideq*0], m4 ; 3
+ vpalignr m3, m5, m0, 10
+ vpalignr m4, m1, m5, 10
+ mova [dstq+stride5q*2], m3 ; 10
+ mova [dstq+strideq*2 ], m4 ; 2
+ vpalignr m3, m5, m0, 12
+ vpalignr m4, m1, m5, 12
+ mova [dst3q+stride3q*2], m3 ; 9
+ mova [dstq+strideq*1 ], m4 ; 1
+ vpalignr m3, m5, m0, 14
+ vpalignr m4, m1, m5, 14
+ mova [dstq+strideq*8], m3 ; 8
+ mova [dstq+strideq*0], m4 ; 0
+ mova [dst3q+strideq*4], m5 ; 7
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
+ mova m0, [lq+mmsize*0+0] ; l[0-15]
+ mova m1, [lq+mmsize*1+0] ; l[16-31]
+ movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno
+ mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop
+ mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345
+ vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0
+ vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01
+ vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012
+ LOWPASS 0, 6, 7 ; L[0-15]
+ vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg
+ vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz*
+ vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a
+ LOWPASS 1, 5, 6 ; L[16-31]#
+ vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx
+ vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq
+ LOWPASS 2, 3, 6 ; A[0-15]
+ movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234
+ vperm2i128 m6, m4, m4, q2001 ; yz012345........
+ vpalignr m7, m6, m4, 2 ; rstuvwxyz012345.
+ LOWPASS 3, 4, 7 ; A[16-31].
+ vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH
+ vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23]
+ vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX
+ DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
+ lea stride3q, [strideq*3]
+ lea stride5q, [stride3q+strideq*2]
+ lea stride7q, [strideq*4+stride3q]
+ lea dst24q, [dst8q+stride3q*8]
+ lea dst8q, [dst8q+strideq*8]
+ mov cntd, 2
+
+.loop:
+ mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7
+ mova [dst24q+stride7q+32], m1
+ mova [dst8q+stride7q+0], m1
+ mova [dst8q+stride7q+32], m2
+ vpalignr m6, m4, m1, 2
+ vpalignr m7, m5, m0, 2
+ vpalignr m9, m8, m2, 2
+ mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
+ mova [dst24q+stride3q*2+32], m6
+ mova [dst8q+stride3q*2+0], m6
+ mova [dst8q+stride3q*2+32], m9
+ vpalignr m6, m4, m1, 4
+ vpalignr m7, m5, m0, 4
+ vpalignr m9, m8, m2, 4
+ mova [dst24q+stride5q+0], m7 ; 29 21 13 5
+ mova [dst24q+stride5q+32], m6
+ mova [dst8q+stride5q+0], m6
+ mova [dst8q+stride5q+32], m9
+ vpalignr m6, m4, m1, 6
+ vpalignr m7, m5, m0, 6
+ vpalignr m9, m8, m2, 6
+ mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4
+ mova [dst24q+strideq*4+32], m6
+ mova [dst8q+strideq*4+0], m6
+ mova [dst8q+strideq*4+32], m9
+ vpalignr m6, m4, m1, 8
+ vpalignr m7, m5, m0, 8
+ vpalignr m9, m8, m2, 8
+ mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3
+ mova [dst24q+stride3q+32], m6
+ mova [dst8q+stride3q+0], m6
+ mova [dst8q+stride3q+32], m9
+ vpalignr m6, m4, m1, 10
+ vpalignr m7, m5, m0, 10
+ vpalignr m9, m8, m2, 10
+ mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2
+ mova [dst24q+strideq*2+32], m6
+ mova [dst8q+strideq*2+0], m6
+ mova [dst8q+strideq*2+32], m9
+ vpalignr m6, m4, m1, 12
+ vpalignr m7, m5, m0, 12
+ vpalignr m9, m8, m2, 12
+ mova [dst24q+strideq+0 ], m7 ; 25 17 9 1
+ mova [dst24q+strideq+32], m6
+ mova [dst8q+strideq+0], m6
+ mova [dst8q+strideq+32], m9
+ vpalignr m6, m4, m1, 14
+ vpalignr m7, m5, m0, 14
+ vpalignr m9, m8, m2, 14
+ mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0
+ mova [dst24q+strideq*0+32], m6
+ mova [dst8q+strideq*0+0], m6
+ mova [dst8q+strideq*0+32], m9
+ mova m0, m5
+ mova m5, m1
+ mova m1, m4
+ mova m4, m2
+ mova m2, m8
+ mova m8, m3
+ sub dst24q, stride7q
+ sub dst24q, strideq
+ sub dst8q, stride7q
+ sub dst8q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endif
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+ movifnidn aq, amp
+ movu m0, [aq] ; abcdefgh
+ psrldq m1, m0, 2 ; bcdefgh.
+ psrldq m2, m0, 4 ; cdefgh..
+ LOWPASS 2, 1, 0 ; BCDEFGH.
+ pavgw m1, m0 ; ABCDEFG.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1
+ movh [dstq+strideq*1], m2
+ psrldq m1, 2
+ psrldq m2, 2
+ movh [dstq+strideq*2], m1
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefgh
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
+ LOWPASS 2, 1, 0 ; BCDEFGHh
+ pavgw m1, m0 ; ABCDEFGh
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ SHIFT_RIGHT m1, m1, m3
+ SHIFT_RIGHT m2, m2, m3
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+%if cpuflag(ssse3)
+ mova m4, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m5, m0, m1, m4
+ LOWPASS 0, 5, 1
+ pavgw m1, m5
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m2
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m3
+ mova [dstq+strideq*1+16], m0
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m2, m1, m2, 2
+ vpalignr m3, m0, m3, 2
+%else
+ PALIGNR m5, m1, m2, 2, m4
+ mova m2, m5
+ PALIGNR m5, m0, m3, 2, m4
+ mova m3, m5
+%endif
+ SHIFT_RIGHT m1, m1, m4
+ SHIFT_RIGHT m0, m0, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ PALIGNR m6, m1, m0, 2, m5
+ PALIGNR m7, m1, m0, 4, m5
+ LOWPASS 7, 6, 0
+ pavgw m6, m0
+ SCRATCH 6, 8, rsp+0*mmsize
+ PALIGNR m4, m2, m1, 2, m0
+ PALIGNR m5, m2, m1, 4, m0
+ LOWPASS 5, 4, 1
+ pavgw m4, m1
+ mova m0, [aq+mmsize*3]
+ PALIGNR m1, m0, m2, 2, m6
+ PALIGNR m3, m0, m2, 4, m6
+ LOWPASS 3, 1, 2
+ pavgw m2, m1
+%if cpuflag(ssse3)
+ PRELOAD 10, pb_2to15_14_15, shuf
+%endif
+ SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
+ LOWPASS 1, 6, 0
+ pavgw m0, m6
+%if ARCH_X86_64
+ pshufd m9, m6, q3333
+%endif
+%if cpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride16, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+ lea stride17q, [stride16q+strideq]
+
+ ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+ UNSCRATCH 6, 8, rsp+0*mmsize
+%endif
+ mova [dstq+strideq*0+ 0], m6
+ mova [dstq+strideq*0+16], m4
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m7
+ mova [dstq+strideq*1+16], m5
+ mova [dstq+strideq*1+32], m3
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+stride16q+ 0], m4
+ mova [dstq+stride16q+16], m2
+ mova [dstq+stride16q+32], m0
+%if ARCH_X86_64
+ mova [dstq+stride16q+48], m9
+%endif
+ mova [dstq+stride17q+ 0], m5
+ mova [dstq+stride17q+16], m3
+ mova [dstq+stride17q+32], m1
+%if ARCH_X86_64
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+ vpalignr m6, m4, m6, 2
+ vpalignr m4, m2, m4, 2
+ vpalignr m2, m0, m2, 2
+ vpalignr m7, m5, m7, 2
+ vpalignr m5, m3, m5, 2
+ vpalignr m3, m1, m3, 2
+%else
+ SCRATCH 3, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 1, 10, rsp+1*mmsize
+%endif
+ PALIGNR m3, m4, m6, 2, m1
+ mova m6, m3
+ PALIGNR m3, m2, m4, 2, m1
+ mova m4, m3
+ PALIGNR m3, m0, m2, 2, m1
+ mova m2, m3
+ PALIGNR m3, m5, m7, 2, m1
+ mova m7, m3
+ UNSCRATCH 3, 8, rsp+0*mmsize
+ SCRATCH 6, 8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 1, 10, rsp+1*mmsize
+ SCRATCH 7, 10, rsp+1*mmsize
+%endif
+ PALIGNR m6, m3, m5, 2, m7
+ mova m5, m6
+ PALIGNR m6, m1, m3, 2, m7
+ mova m3, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+1*mmsize
+%endif
+%endif
+ SHIFT_RIGHT m1, m1, reg_shuf
+ SHIFT_RIGHT m0, m0, reg_shuf
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+48], m0
+ mova [dstq+strideq*2+48], m0
+ mova [dstq+stride3q +48], m0
+%if %%n < 3
+ lea dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+ movu m0, [aq-2]
+ movhps m1, [lq]
+ PALIGNR m0, m1, 10, m2 ; xyz*abcd
+ pslldq m1, m0, 2 ; .xyz*abc
+ pslldq m2, m0, 4 ; ..xyz*ab
+ LOWPASS 2, 1, 0 ; ..YZ#ABC
+ pavgw m1, m0 ; ....#ABC
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m2
+ shufps m0, m2, m1, q3210
+%if cpuflag(ssse3)
+ pshufb m2, [pb_4_5_8to13_8x0]
+%else
+ pshuflw m2, m2, q2222
+ psrldq m2, 6
+%endif
+ psrldq m0, 6
+ movh [dstq+strideq*2], m0
+ movh [dstq+stride3q ], m2
+ RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [lq] ; stuvwxyz
+ mova m0, [aq] ; abcdefgh
+ PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
+ LOWPASS 3, 1, 0
+ pavgw m0, m1
+ PALIGNR m1, m2, 2, m4 ; tuvwxyz*
+ pslldq m4, m2, 2 ; .stuvwxy
+ LOWPASS 4, 2, 1
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m1
+ pslldq m4, 2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ PALIGNR m0, m4, 14, m1
+ pslldq m4, 2
+ PALIGNR m3, m4, 14, m4
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+ movu m1, [aq-2] ; *abcdefg
+ movu m2, [aq+mmsize-2] ; hijklmno
+ mova m3, [aq] ; abcdefgh
+ mova m4, [aq+mmsize] ; ijklmnop
+ mova m5, [lq+mmsize] ; stuvwxyz
+ PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
+ movu m6, [aq+mmsize-4] ; ghijklmn
+ LOWPASS 6, 2, 4
+ pavgw m2, m4
+ LOWPASS 0, 1, 3
+ pavgw m3, m1
+ PALIGNR m1, m5, 2, m7 ; tuvwxyz*
+ movu m7, [lq+mmsize-2] ; rstuvwxy
+ LOWPASS 1, 5, 7
+ movu m5, [lq+2] ; lmnopqrs
+ pslldq m4, m5, 2 ; .lmnopqr
+ pslldq m7, m5, 4 ; ..lmnopq
+ LOWPASS 5, 4, 7
+ psrld m4, m1, 16
+ psrld m7, m5, 16
+ pand m1, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m7, m4
+ packssdw m5, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+
+.loop:
+ mova [dstq+strideq*0+ 0], m3
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m6
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m2, m3, 14, m4
+ PALIGNR m3, m7, 14, m4
+ pslldq m7, 2
+ PALIGNR m6, m0, 14, m4
+ PALIGNR m0, m5, 14, m4
+ pslldq m5, 2
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+ movu m0, [aq+mmsize*0-2] ; *a[0-6]
+ movu m1, [aq+mmsize*1-2] ; a[7-14]
+ movu m2, [aq+mmsize*2-2] ; a[15-22]
+ movu m3, [aq+mmsize*3-2] ; a[23-30]
+ mova m4, [aq+mmsize*3+0] ; a[24-31]
+ movu m5, [aq+mmsize*3-4] ; a[22-29]
+ LOWPASS 5, 3, 4 ; A[23-30]
+ SCRATCH 5, 8, rsp+0*mmsize
+ pavgw m3, m4
+ mova m4, [aq+mmsize*2+0] ; a[16-23]
+ movu m6, [aq+mmsize*2-4] ; a[14-21]
+ LOWPASS 6, 2, 4 ; A[15-22]
+ SCRATCH 6, 9, rsp+1*mmsize
+ pavgw m2, m4
+ mova m4, [aq+mmsize*1+0] ; a[8-15]
+ movu m7, [aq+mmsize*1-4] ; a[6-13]
+ LOWPASS 7, 1, 4 ; A[7-14]
+ SCRATCH 7, 10, rsp+2*mmsize
+ pavgw m1, m4
+ mova m4, [aq+mmsize*0+0] ; a[0-7]
+ mova m5, [lq+mmsize*3+0] ; l[24-31]
+ PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
+ LOWPASS 6, 0, 4 ; #A[0-6]
+ SCRATCH 6, 11, rsp+3*mmsize
+ pavgw m4, m0
+ PALIGNR m0, m5, 2, m7 ; l[25-31]*
+ movu m7, [lq+mmsize*3-2] ; l[23-30]
+ LOWPASS 0, 5, 7 ; L[24-31]
+ movu m5, [lq+mmsize*2-2] ; l[15-22]
+ mova m7, [lq+mmsize*2+0] ; l[16-23]
+ movu m6, [lq+mmsize*2+2] ; l[17-24]
+ LOWPASS 5, 7, 6 ; L[16-23]
+ psrld m7, m0, 16
+ psrld m6, m5, 16
+ pand m0, [pd_65535]
+ pand m5, [pd_65535]
+ packssdw m6, m7
+ packssdw m5, m0
+ SCRATCH 5, 12, rsp+4*mmsize
+ SCRATCH 6, 13, rsp+5*mmsize
+ movu m6, [lq+mmsize*1-2] ; l[7-14]
+ mova m0, [lq+mmsize*1+0] ; l[8-15]
+ movu m5, [lq+mmsize*1+2] ; l[9-16]
+ LOWPASS 6, 0, 5 ; L[8-15]
+ movu m0, [lq+mmsize*0+2] ; l[1-8]
+ pslldq m5, m0, 2 ; .l[1-7]
+ pslldq m7, m0, 4 ; ..l[1-6]
+ LOWPASS 0, 5, 7
+ psrld m5, m6, 16
+ psrld m7, m0, 16
+ pand m6, [pd_65535]
+ pand m0, [pd_65535]
+ packssdw m7, m5
+ packssdw m0, m6
+ UNSCRATCH 6, 13, rsp+5*mmsize
+ DEFINE_ARGS dst, stride, stride16, cnt, stride17
+ mov stride16q, strideq
+ mov cntd, 8
+ shl stride16q, 4
+%if ARCH_X86_64
+ lea stride17q, [stride16q+strideq]
+%endif
+
+.loop:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+ mova [dstq+strideq*1+ 0], m11
+ mova [dstq+strideq*1+16], m10
+ mova [dstq+strideq*1+32], m9
+ mova [dstq+strideq*1+48], m8
+%endif
+ mova [dstq+stride16q+ 0], m6
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m1
+ mova [dstq+stride16q+48], m2
+%if ARCH_X86_64
+ mova [dstq+stride17q+ 0], m12
+ mova [dstq+stride17q+16], m11
+ mova [dstq+stride17q+32], m10
+ mova [dstq+stride17q+48], m9
+%endif
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m3, m2, 14, m5
+ PALIGNR m2, m1, 14, m5
+ PALIGNR m1, m4, 14, m5
+ PALIGNR m4, m6, 14, m5
+ PALIGNR m6, m7, 14, m5
+ pslldq m7, 2
+%if ARCH_X86_64
+ PALIGNR m8, m9, 14, m5
+ PALIGNR m9, m10, 14, m5
+ PALIGNR m10, m11, 14, m5
+ PALIGNR m11, m12, 14, m5
+ PALIGNR m12, m0, 14, m5
+ pslldq m0, 2
+%endif
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 5, 12, rsp+4*mmsize
+ UNSCRATCH 4, 11, rsp+3*mmsize
+ UNSCRATCH 3, 10, rsp+2*mmsize
+ UNSCRATCH 2, 9, rsp+1*mmsize
+ UNSCRATCH 1, 8, rsp+0*mmsize
+ mov dstq, dstm
+ mov cntd, 8
+ add dstq, strideq
+.loop2:
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m3
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m1
+ mova [dstq+stride16q+ 0], m5
+ mova [dstq+stride16q+16], m4
+ mova [dstq+stride16q+32], m3
+ mova [dstq+stride16q+48], m2
+ lea dstq, [dstq+strideq*2]
+ PALIGNR m1, m2, 14, m6
+ PALIGNR m2, m3, 14, m6
+ PALIGNR m3, m4, 14, m6
+ PALIGNR m4, m5, 14, m6
+ PALIGNR m5, m0, 14, m6
+ pslldq m0, 2
+ dec cntd
+ jg .loop2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+ movh m0, [lq] ; abcd
+%if cpuflag(ssse3)
+ pshufb m0, [pb_0to7_67x4] ; abcddddd
+%else
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333 ; abcddddd
+%endif
+ psrldq m1, m0, 2 ; bcddddd.
+ psrldq m2, m0, 4 ; cddddd..
+ LOWPASS 2, 1, 0 ; BCDddd..
+ pavgw m1, m0 ; abcddddd
+ SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
+ PALIGNR m2, m1, 4, m0 ; bCcDdddd
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+strideq*0], m1 ; aBbC
+ movh [dstq+strideq*1], m2 ; bCcD
+ movhps [dstq+strideq*2], m1 ; cDdd
+ movhps [dstq+stride3q ], m2 ; dddd
+ RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+ mova m0, [lq]
+%if cpuflag(ssse3)
+ mova m3, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m2, m0, m3
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+%else
+ PALIGNR m0, m2, m1, 4, m3
+ mova m1, m0
+%endif
+ pshufd m2, m2, q3321
+ shufps m0, m1, m2, q1032
+ pshufd m3, m2, q3332
+ mova [dstq+strideq *0], m1
+ mova [dstq+strideq *2], m0
+ mova [dstq+strideq *4], m2
+ mova [dstq+stride3q*2], m3
+ RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+ mova m0, [lq]
+ mova m3, [lq+mmsize]
+ movu m1, [lq+2]
+ movu m2, [lq+4]
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ SBUTTERFLY wd, 1, 2, 0
+%if cpuflag(ssse3)
+ mova m5, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m0, m4, m3, m5
+ LOWPASS 4, 0, 3
+ pavgw m3, m0
+ SBUTTERFLY wd, 3, 4, 5
+ pshufd m0, m0, q3333
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+
+.loop:
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m2
+ mova [dstq+strideq *4+ 0], m2
+ mova [dstq+strideq *4+16], m3
+ mova [dstq+strideq *8+ 0], m3
+ mova [dstq+strideq *8+16], m4
+ mova [dstq+stride3q*4+ 0], m4
+ mova [dstq+stride3q*4+16], m0
+ add dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m4, m3, 4
+ vpalignr m4, m0, m4, 4
+%else
+ PALIGNR m5, m2, m1, 4, m6
+ mova m1, m5
+ PALIGNR m5, m3, m2, 4, m6
+ mova m2, m5
+ PALIGNR m5, m4, m3, 4, m6
+ mova m3, m5
+ PALIGNR m5, m0, m4, 4, m6
+ mova m4, m5
+%endif
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+ %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ SCRATCH 1, 8, rsp+0*mmsize
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m7, [lq+mmsize*3+0]
+ SCRATCH 0, 9, rsp+1*mmsize
+%if cpuflag(ssse3)
+ mova m0, [pb_2to15_14_15]
+%endif
+ SHIFT_RIGHTx2 m1, m6, m7, m0
+ LOWPASS 6, 1, 7
+ pavgw m7, m1
+ SBUTTERFLY wd, 7, 6, 0
+ pshufd m1, m1, q3333
+ UNSCRATCH 0, 9, rsp+1*mmsize
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ lea stride3q, [strideq*3]
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+ mov cntd, 4
+
+.loop:
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+1*mmsize], m1
+ mova m1, [rsp+0*mmsize]
+%endif
+ mova [dstq+strideq *0+ 0], m1
+ mova [dstq+strideq *0+16], m0
+ mova [dstq+strideq *0+32], m3
+ mova [dstq+strideq *0+48], m2
+ mova [dstq+stride4q*1+ 0], m0
+ mova [dstq+stride4q*1+16], m3
+ mova [dstq+stride4q*1+32], m2
+ mova [dstq+stride4q*1+48], m5
+ mova [dstq+stride4q*2+ 0], m3
+ mova [dstq+stride4q*2+16], m2
+ mova [dstq+stride4q*2+32], m5
+ mova [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+%else
+ SCRATCH 6, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ SCRATCH 7, 10, rsp+3*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ UNSCRATCH 6, 9, rsp+2*mmsize
+ SCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 10, rsp+3*mmsize
+ SCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+ SWAP 1, 8
+%else
+ mova [rsp+0*mmsize], m1
+ mova m1, [rsp+1*mmsize]
+%endif
+ mova [dstq+stride3q*4+ 0], m2
+ mova [dstq+stride3q*4+16], m5
+ mova [dstq+stride3q*4+32], m4
+ mova [dstq+stride3q*4+48], m7
+ mova [dstq+stride4q*4+ 0], m5
+ mova [dstq+stride4q*4+16], m4
+ mova [dstq+stride4q*4+32], m7
+ mova [dstq+stride4q*4+48], m6
+ mova [dstq+stride20q + 0], m4
+ mova [dstq+stride20q +16], m7
+ mova [dstq+stride20q +32], m6
+ mova [dstq+stride20q +48], m1
+ mova [dstq+stride3q*8+ 0], m7
+ mova [dstq+stride3q*8+16], m6
+ mova [dstq+stride3q*8+32], m1
+ mova [dstq+stride3q*8+48], m1
+ mova [dstq+stride28q + 0], m6
+ mova [dstq+stride28q +16], m1
+ mova [dstq+stride28q +32], m1
+ mova [dstq+stride28q +48], m1
+%if cpuflag(avx)
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m1, m6, 4
+%else
+ PALIGNR m0, m5, m2, 4, m3
+ mova m2, m0
+ PALIGNR m0, m4, m5, 4, m3
+ mova m5, m0
+ PALIGNR m0, m7, m4, 4, m3
+ mova m4, m0
+ PALIGNR m0, m6, m7, 4, m3
+ mova m7, m0
+ PALIGNR m0, m1, m6, 4, m3
+ mova m6, m0
+ UNSCRATCH 0, 9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+ UNSCRATCH 3, 10, rsp+3*mmsize
+%endif
+%endif
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+ movh m0, [lq]
+ movhps m0, [aq-2]
+ psrldq m1, m0, 2
+ psrldq m2, m0, 4
+ LOWPASS 2, 1, 0
+ pavgw m1, m0
+ punpcklwd m1, m2
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movh [dstq+stride3q ], m1
+ movhps [dstq+strideq*1], m1
+ movhlps m2, m2
+ PALIGNR m2, m1, 4, m0
+ movh [dstq+strideq*2], m2
+ movhps [dstq+strideq*0], m2
+ RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ movu m1, [aq-2]
+ PALIGNR m2, m1, m0, 2, m3
+ PALIGNR m3, m1, m0, 4, m4
+ LOWPASS 3, 2, 0
+ pavgw m2, m0
+ SBUTTERFLY wd, 2, 3, 0
+ psrldq m0, m1, 2
+ psrldq m4, m1, 4
+ LOWPASS 1, 0, 4
+ DEFINE_ARGS dst8, mstride, cnt
+ lea dst8q, [dst8q+mstrideq*8]
+ neg mstrideq
+ mov cntd, 4
+
+.loop:
+ add dst8q, mstrideq
+ mova [dst8q+mstrideq*0], m2
+ mova [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+ vpalignr m2, m3, m2, 4
+ vpalignr m3, m1, m3, 4
+%else
+ PALIGNR m0, m3, m2, 4, m4
+ mova m2, m0
+ PALIGNR m0, m1, m3, 4, m4
+ mova m3, m0
+%endif
+ psrldq m1, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+ mova m2, [lq]
+ movu m1, [lq+2]
+ movu m0, [lq+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ mova m4, [lq+mmsize]
+ movu m5, [aq-2]
+ PALIGNR m3, m5, m4, 2, m6
+ PALIGNR m2, m5, m4, 4, m6
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 1, 0, 4
+ SBUTTERFLY wd, 3, 2, 4
+ mova m6, [aq]
+ movu m4, [aq+2]
+ LOWPASS 4, 6, 5
+ movu m5, [aq+mmsize-2]
+ psrldq m6, m5, 2
+ psrldq m7, m5, 4
+ LOWPASS 5, 6, 7
+ DEFINE_ARGS dst, mstride, mstride3, cnt
+ lea dstq, [dstq+mstrideq*8]
+ lea dstq, [dstq+mstrideq*8]
+ neg mstrideq
+ lea mstride3q, [mstrideq*3]
+ mov cntd, 4
+
+.loop:
+ add dstq, mstrideq
+ mova [dstq+mstride3q*4+ 0], m2
+ mova [dstq+mstride3q*4+16], m4
+ mova [dstq+mstrideq *8+ 0], m3
+ mova [dstq+mstrideq *8+16], m2
+ mova [dstq+mstrideq *4+ 0], m0
+ mova [dstq+mstrideq *4+16], m3
+ mova [dstq+mstrideq *0+ 0], m1
+ mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m4, m2, 4
+ vpalignr m4, m5, m4, 4
+%else
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m4, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m5, m4, 4, m7
+ mova m4, m6
+%endif
+ psrldq m5, 4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+ 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
+ mova m2, [lq+mmsize*0+0]
+ movu m1, [lq+mmsize*0+2]
+ movu m0, [lq+mmsize*0+4]
+ LOWPASS 0, 1, 2
+ pavgw m1, m2
+ SBUTTERFLY wd, 1, 0, 2
+ mova m4, [lq+mmsize*1+0]
+ movu m3, [lq+mmsize*1+2]
+ movu m2, [lq+mmsize*1+4]
+ LOWPASS 2, 3, 4
+ pavgw m3, m4
+ SBUTTERFLY wd, 3, 2, 4
+ SCRATCH 0, 8, rsp+0*mmsize
+ SCRATCH 1, 9, rsp+1*mmsize
+ SCRATCH 2, 10, rsp+2*mmsize
+ SCRATCH 3, 11, rsp+3*mmsize
+ mova m6, [lq+mmsize*2+0]
+ movu m5, [lq+mmsize*2+2]
+ movu m4, [lq+mmsize*2+4]
+ LOWPASS 4, 5, 6
+ pavgw m5, m6
+ SBUTTERFLY wd, 5, 4, 6
+ mova m0, [lq+mmsize*3+0]
+ movu m1, [aq+mmsize*0-2]
+ PALIGNR m7, m1, m0, 2, m2
+ PALIGNR m6, m1, m0, 4, m2
+ LOWPASS 6, 7, 0
+ pavgw m7, m0
+ SBUTTERFLY wd, 7, 6, 0
+ mova m2, [aq+mmsize*0+0]
+ movu m0, [aq+mmsize*0+2]
+ LOWPASS 0, 2, 1
+ movu m1, [aq+mmsize*1-2]
+ mova m2, [aq+mmsize*1+0]
+ movu m3, [aq+mmsize*1+2]
+ LOWPASS 1, 2, 3
+ SCRATCH 6, 12, rsp+6*mmsize
+ SCRATCH 7, 13, rsp+7*mmsize
+ movu m2, [aq+mmsize*2-2]
+ mova m3, [aq+mmsize*2+0]
+ movu m6, [aq+mmsize*2+2]
+ LOWPASS 2, 3, 6
+ movu m3, [aq+mmsize*3-2]
+ psrldq m6, m3, 2
+ psrldq m7, m3, 4
+ LOWPASS 3, 6, 7
+ UNSCRATCH 6, 12, rsp+6*mmsize
+ UNSCRATCH 7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+ mova [rsp+4*mmsize], m4
+ mova [rsp+5*mmsize], m5
+ ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+ ; to do it again here
+%endif
+ DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ lea stride4q, [strideq*4]
+ lea stride28q, [stride4q*8]
+ lea stride20q, [stride4q*5]
+ sub stride28q, stride4q
+%endif
+ add dstq, stride3q
+
+ ; x86-32 doesn't have enough registers, so on that platform, we split
+ ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+ mova [dstq+stride28q + 0], m9
+ mova [dstq+stride28q +16], m8
+ mova [dstq+stride28q +32], m11
+ mova [dstq+stride28q +48], m10
+ mova [dstq+stride3q*8+ 0], m8
+ mova [dstq+stride3q*8+16], m11
+ mova [dstq+stride3q*8+32], m10
+ mova [dstq+stride3q*8+48], m5
+ mova [dstq+stride20q + 0], m11
+ mova [dstq+stride20q +16], m10
+ mova [dstq+stride20q +32], m5
+ mova [dstq+stride20q +48], m4
+ mova [dstq+stride4q*4+ 0], m10
+ mova [dstq+stride4q*4+16], m5
+ mova [dstq+stride4q*4+32], m4
+ mova [dstq+stride4q*4+48], m7
+%endif
+ mova [dstq+stride3q*4+ 0], m5
+ mova [dstq+stride3q*4+16], m4
+ mova [dstq+stride3q*4+32], m7
+ mova [dstq+stride3q*4+48], m6
+ mova [dstq+strideq* 8+ 0], m4
+ mova [dstq+strideq* 8+16], m7
+ mova [dstq+strideq* 8+32], m6
+ mova [dstq+strideq* 8+48], m0
+ mova [dstq+strideq* 4+ 0], m7
+ mova [dstq+strideq* 4+16], m6
+ mova [dstq+strideq* 4+32], m0
+ mova [dstq+strideq* 4+48], m1
+ mova [dstq+strideq* 0+ 0], m6
+ mova [dstq+strideq* 0+16], m0
+ mova [dstq+strideq* 0+32], m1
+ mova [dstq+strideq* 0+48], m2
+ sub dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+ vpalignr m9, m8, m9, 4
+ vpalignr m8, m11, m8, 4
+ vpalignr m11, m10, m11, 4
+ vpalignr m10, m5, m10, 4
+%endif
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+ vpalignr m6, m0, m6, 4
+ vpalignr m0, m1, m0, 4
+ vpalignr m1, m2, m1, 4
+ vpalignr m2, m3, m2, 4
+%else
+%if ARCH_X86_64
+ PALIGNR m12, m8, m9, 4, m13
+ mova m9, m12
+ PALIGNR m12, m11, m8, 4, m13
+ mova m8, m12
+ PALIGNR m12, m10, m11, 4, m13
+ mova m11, m12
+ PALIGNR m12, m5, m10, 4, m13
+ mova m10, m12
+%endif
+ SCRATCH 3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 2, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m4, m5, 4, m2
+ mova m5, m3
+ PALIGNR m3, m7, m4, 4, m2
+ mova m4, m3
+ PALIGNR m3, m6, m7, 4, m2
+ mova m7, m3
+ PALIGNR m3, m0, m6, 4, m2
+ mova m6, m3
+ PALIGNR m3, m1, m0, 4, m2
+ mova m0, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 2, 13, rsp+9*mmsize
+ SCRATCH 0, 13, rsp+9*mmsize
+%endif
+ PALIGNR m3, m2, m1, 4, m0
+ mova m1, m3
+ PALIGNR m3, reg_sh, m2, 4, m0
+ mova m2, m3
+%if notcpuflag(ssse3)
+ UNSCRATCH 0, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 3, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m3, 4
+ dec cntd
+ jg .loop
+
+%if ARCH_X86_32
+ UNSCRATCH 0, 8, rsp+0*mmsize
+ UNSCRATCH 1, 9, rsp+1*mmsize
+ UNSCRATCH 2, 10, rsp+2*mmsize
+ UNSCRATCH 3, 11, rsp+3*mmsize
+ mova m4, [rsp+4*mmsize]
+ mova m5, [rsp+5*mmsize]
+ mova m6, [rsp+6*mmsize]
+ mova m7, [rsp+7*mmsize]
+ DEFINE_ARGS dst, stride, stride5, stride3
+ lea stride5q, [strideq*5]
+ lea dstq, [dstq+stride5q*4]
+ DEFINE_ARGS dst, stride, cnt, stride3
+ mov cntd, 4
+.loop_2:
+ mova [dstq+stride3q*4+ 0], m1
+ mova [dstq+stride3q*4+16], m0
+ mova [dstq+stride3q*4+32], m3
+ mova [dstq+stride3q*4+48], m2
+ mova [dstq+strideq* 8+ 0], m0
+ mova [dstq+strideq* 8+16], m3
+ mova [dstq+strideq* 8+32], m2
+ mova [dstq+strideq* 8+48], m5
+ mova [dstq+strideq* 4+ 0], m3
+ mova [dstq+strideq* 4+16], m2
+ mova [dstq+strideq* 4+32], m5
+ mova [dstq+strideq* 4+48], m4
+ mova [dstq+strideq* 0+ 0], m2
+ mova [dstq+strideq* 0+16], m5
+ mova [dstq+strideq* 0+32], m4
+ mova [dstq+strideq* 0+48], m7
+ sub dstq, strideq
+%if cpuflag(avx)
+ vpalignr m1, m0, m1, 4
+ vpalignr m0, m3, m0, 4
+ vpalignr m3, m2, m3, 4
+ vpalignr m2, m5, m2, 4
+ vpalignr m5, m4, m5, 4
+ vpalignr m4, m7, m4, 4
+ vpalignr m7, m6, m7, 4
+%else
+ SCRATCH 6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+ SCRATCH 7, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m0, m1, 4, m7
+ mova m1, m6
+ PALIGNR m6, m3, m0, 4, m7
+ mova m0, m6
+ PALIGNR m6, m2, m3, 4, m7
+ mova m3, m6
+ PALIGNR m6, m5, m2, 4, m7
+ mova m2, m6
+ PALIGNR m6, m4, m5, 4, m7
+ mova m5, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 7, 13, rsp+9*mmsize
+ SCRATCH 5, 13, rsp+9*mmsize
+%endif
+ PALIGNR m6, m7, m4, 4, m5
+ mova m4, m6
+ PALIGNR m6, reg_sh, m7, 4, m5
+ mova m7, m6
+%if notcpuflag(ssse3)
+ UNSCRATCH 5, 13, rsp+9*mmsize
+%endif
+ UNSCRATCH 6, 12, rsp+8*mmsize, sh
+%endif
+ psrldq m6, 4
+ dec cntd
+ jg .loop_2
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
diff --git a/libs/ffvpx/libavcodec/x86/vp9itxfm.asm b/libs/ffvpx/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 000000000..2c63fe514
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,3197 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA 32
+
+%macro VP9_IDCT_COEFFS 2-3 0
+const pw_m%1_%2
+times 8 dw -%1, %2
+const pw_%2_%1
+times 8 dw %2, %1
+
+%if %3 == 1
+const pw_m%2_m%1
+times 8 dw -%2, -%1
+%if %1 != %2
+const pw_m%2_%1
+times 8 dw -%2, %1
+const pw_%1_%2
+times 8 dw %1, %2
+%endif
+%endif
+
+%if %1 < 11585
+pw_m%1x2: times 16 dw -%1*2
+%elif %1 > 11585
+pw_%1x2: times 16 dw %1*2
+%else
+const pw_%1x2
+times 16 dw %1*2
+%endif
+
+%if %2 != %1
+pw_%2x2: times 16 dw %2*2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 16364, 804
+VP9_IDCT_COEFFS 16305, 1606
+VP9_IDCT_COEFFS 16069, 3196, 1
+VP9_IDCT_COEFFS 15893, 3981
+VP9_IDCT_COEFFS 15137, 6270, 1
+VP9_IDCT_COEFFS 14811, 7005
+VP9_IDCT_COEFFS 14449, 7723
+VP9_IDCT_COEFFS 13160, 9760
+VP9_IDCT_COEFFS 11585, 11585, 1
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 9102, 13623, 1
+VP9_IDCT_COEFFS 8423, 14053
+VP9_IDCT_COEFFS 5520, 15426
+VP9_IDCT_COEFFS 4756, 15679
+VP9_IDCT_COEFFS 2404, 16207
+
+const pw_5283_13377
+times 4 dw 5283, 13377
+const pw_9929_13377
+times 4 dw 9929, 13377
+const pw_15212_m13377
+times 4 dw 15212, -13377
+const pw_15212_9929
+times 4 dw 15212, 9929
+const pw_m5283_m15212
+times 4 dw -5283, -15212
+const pw_13377x2
+times 8 dw 13377*2
+const pw_m13377_13377
+times 4 dw -13377, 13377
+const pw_13377_0
+times 4 dw 13377, 0
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+cextern pd_8192
+
+SECTION .text
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+ punpckhwd m%4, m%2, m%1
+ punpcklwd m%2, m%1
+ pmaddwd m%3, m%4, [pw_m%5_%6]
+ pmaddwd m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_m%5_%6]
+ pmaddwd m%2, [pw_%6_%5]
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%1, %6
+ paddd m%2, %6
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%1, 14
+ psrad m%2, 14
+ psrad m%3, 14
+ psrad m%4, 14
+ packssdw m%1, m%3
+ packssdw m%2, m%4
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+%if mmsize == 32
+ pmovzxbw m%3, [%6]
+ pmovzxbw m%4, [%6+strideq]
+%else
+ movh m%3, [%6]
+ movh m%4, [%6+strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+%endif
+ paddw m%3, m%1
+ paddw m%4, m%2
+%if mmsize == 32
+ packuswb m%3, m%4
+ ; Intel...
+ vpermq m%3, m%3, q3120
+ mova [%6], xm%3
+ vextracti128 [%6+strideq], m%3, 1
+%elif mmsize == 16
+ packuswb m%3, m%4
+ movh [%6], m%3
+ movhps [%6+strideq], m%3
+%else
+ packuswb m%3, m%5
+ packuswb m%4, m%5
+ movh [%6], m%3
+ movh [%6+strideq], m%4
+%endif
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+ mova m0, [blockq+0*8]
+ mova m1, [blockq+1*8]
+ mova m2, [blockq+2*8]
+ mova m3, [blockq+3*8]
+ psraw m0, 2
+ psraw m1, 2
+ psraw m2, 2
+ psraw m3, 2
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m4, m4
+ VP9_STORE_2X 0, 1, 5, 6, 4
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 5, 6, 4
+ ZERO_BLOCK blockq, 8, 4, m4
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+ pmulhrsw m0, m5 ; m0=t1
+ mova m2, m0 ; m2=t0
+ mova m3, m1
+ pmulhrsw m1, m6 ; m1=t2
+ pmulhrsw m3, m7 ; m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ psraw m0, 4
+ psraw m1, 4
+%endif
+ VP9_STORE_2X 0, 1, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ paddw m2, m5
+ paddw m3, m5
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ VP9_STORE_2X 2, 3, 6, 7, 4
+%endmacro
+
+%macro IDCT_4x4_FN 1
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+ cmp eobd, 4 ; 2x2 or smaller
+ jg .idctfull
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct2x2
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (8 << 14) + 8192
+ sar coefd, 14 + 4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ pxor m4, m4
+ movh [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+ movd m0, [blockq+0]
+ movd m1, [blockq+8]
+ mova m5, [pw_11585x2]
+ mova m6, [pw_6270x2]
+ mova m7, [pw_15137x2]
+ VP9_IDCT4_2x2_1D
+ ; partial 2x4 transpose
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ SBUTTERFLY dq, 0, 2, 1
+ SWAP 1, 2
+ VP9_IDCT4_2x2_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movh [blockq+ 0], m4
+ movh [blockq+ 8], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct, IDCT4, sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%macro SCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7
+ SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6
+ SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5
+
+ UNSCRATCH 5, 8, blockq+ 0
+ SCRATCH 2, 8, blockq+ 0
+
+ SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+%endmacro
+
+; x86-32
+; - in: m0/m4 is in mem
+; - out: m6 is in mem
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 4, 9
+%endif
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a
+ VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a
+ SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+%else
+ VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a
+
+ UNSCRATCH 0, 8, blockq+ 0 ; IN(0)
+ UNSCRATCH 4, 9, blockq+64 ; IN(4)
+ SCRATCH 5, 8, blockq+ 0
+
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+ pmulhrsw m4, W_11585x2_REG ; m4=t0a
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a
+%else
+ SCRATCH 7, 9, blockq+64
+ VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7
+ UNSCRATCH 7, 9, blockq+64
+%endif
+ SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+ pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a
+ pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a
+ pmulhrsw m2, [pw_6270x2] ; m2=t2a
+ pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a
+ pmulhrsw m1, [pw_3196x2] ; m1=t4a
+ pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a
+ pmulhrsw m3, [pw_13623x2] ; m3=t6a
+ SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+ SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+ pmulhrsw m1, W_11585x2_REG ; m1=t6
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ psubw m4, m0, m6 ; m4=t0a-t3a (t3)
+ paddw m6, m0 ; m6=t0a+t3a (t0)
+ SCRATCH 5, 8, blockq+ 0
+ SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1
+ pmulhrsw m0, W_11585x2_REG ; m0=t0
+ pmulhrsw m3, m1, W_16069x2_REG ; m3=t7
+ pmulhrsw m1, W_3196x2_REG ; m1=t4
+ psubw m7, m3, m1 ; t5 = t7a - t4a
+ paddw m5, m3, m1 ; t6 = t7a + t4a
+ pmulhrsw m7, W_11585x2_REG ; m7=t5
+ pmulhrsw m5, W_11585x2_REG ; m5=t6
+ SWAP 5, 1
+ ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+ psubw m6, m0, m3 ; m6=t0-t7
+ paddw m3, m0 ; m3=t0+t7
+ psubw m2, m0, m1 ; m2=t1-t6
+ paddw m1, m0 ; m1=t1+t6
+%if %1 == 1
+ punpcklwd m3, m1
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+ mova [blockq+ 0], m2
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+ psubw m4, m0, m5 ; m4=t3-t4
+ paddw m5, m0 ; m5=t3+t4
+ SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5
+ SWAP 7, 6, 2
+ SWAP 3, 5, 0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
+%if cpuflag(ssse3)
+ pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+ pmulhrsw m%2, %6
+%else
+ paddw m%1, %6
+ paddw m%2, %6
+ psraw m%1, %7
+ psraw m%2, %7
+%endif
+%if %0 <= 7
+ VP9_STORE_2X %1, %2, %3, %4, %5
+%else
+ VP9_STORE_2X %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 is in mem
+; x86-64:
+; - m8 holds m6 (SWAP)
+; m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m9, [pw_1024]
+%else
+ mova m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+ SCRATCH 5, 10, blockq+16
+ SCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 10, blockq+16
+ UNSCRATCH 7, 11, blockq+32
+ VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG
+ lea dstq, [dstq+2*strideq]
+ UNSCRATCH 5, 8, blockq+ 0
+ VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+ mova m12, [pw_11585x2] ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+ cmp eobd, 12 ; top left half or less
+ jg .idctfull
+
+ cmp eobd, 3 ; top left corner or less
+ jg .idcthalf
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idcttopleftcorner
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ pmulhrsw m0, W_11585x2_REG
+ pmulhrsw m0, W_11585x2_REG
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (16 << 14) + 8192
+ sar coefd, 14 + 5
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, 0
+ pxor m4, m4
+ movd [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+%if cpuflag(ssse3)
+; faster path for when only left corner is set (3 input: DC, right to DC, below
+; to DC). Note: also working with a 2x2 block
+.idcttopleftcorner:
+ movd m0, [blockq+0]
+ movd m1, [blockq+16]
+%if ARCH_X86_64
+ mova m10, [pw_3196x2]
+ mova m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+ VP9_IDCT8_2x2_1D 1
+ ; partial 2x8 transpose
+ ; punpcklwd m0, m1 already done inside idct
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ SBUTTERFLY qdq, 0, 4, 1
+ SWAP 1, 4
+ VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movd [blockq+ 0], m6
+ movd [blockq+16], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ RET
+
+.idcthalf:
+ movh m0, [blockq + 0]
+ movh m1, [blockq +16]
+ movh m2, [blockq +32]
+ movh m3, [blockq +48]
+ VP9_IDCT8_4x4_1D
+ ; partial 4x8 transpose
+%if ARCH_X86_32
+ mova m6, [blockq+ 0]
+%endif
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ SBUTTERFLY dq, 0, 2, 1
+ SBUTTERFLY dq, 4, 6, 5
+ SBUTTERFLY qdq, 0, 4, 1
+ SBUTTERFLY qdq, 2, 6, 5
+ SWAP 1, 4
+ SWAP 3, 6
+ VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6
+ VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+ movh [blockq+ 0], m6
+ movh [blockq+16], m6
+ movh [blockq+32], m6
+%else
+ mova [blockq+ 0], m6
+ mova [blockq+16], m6
+ mova [blockq+32], m6
+%endif
+ movh [blockq+48], m6
+ RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+ mova m3, [blockq+ 48] ; IN(3)
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+ mova m7, [blockq+112] ; IN(7)
+%if ARCH_X86_64
+ mova m11, [pd_8192] ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+ VP9_IDCT8_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+0], m0
+%endif
+ VP9_IDCT8_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 3, 9
+ SWAP 4, 10
+ SWAP 7, 11
+%endif
+
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d]
+ SCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w]
+ UNSCRATCH 4, 12, blockq+1*16
+ VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w]
+
+ UNSCRATCH 0, 8, blockq+16*0
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 7, 11, blockq+16*7
+ SCRATCH 1, 8, blockq+16*1
+ SCRATCH 2, 9, blockq+16*2
+ SCRATCH 5, 10, blockq+16*5
+ SCRATCH 6, 11, blockq+16*6
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w]
+ UNSCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w]
+
+ UNSCRATCH 2, 9, blockq+16*2
+ UNSCRATCH 5, 10, blockq+16*5
+ SCRATCH 3, 9, blockq+16*3
+ SCRATCH 4, 10, blockq+16*4
+
+ ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d]
+ SCRATCH 1, 12, blockq+ 0*16
+ VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG
+ UNSCRATCH 1, 12, blockq+ 0*16
+ PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w]
+
+ UNSCRATCH 1, 8, blockq+16*1
+ UNSCRATCH 3, 9, blockq+16*3
+ UNSCRATCH 4, 10, blockq+16*4
+ UNSCRATCH 6, 11, blockq+16*6
+ SCRATCH 2, 8, blockq+16*0
+
+ SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w]
+ SUMSUB_BA w, 1, 3, 2
+ PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w]
+
+ ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 3, 4, 2
+ SUMSUB_BA w, 0, 7, 2
+ pmulhrsw m3, W_11585x2_REG
+ pmulhrsw m7, W_11585x2_REG
+ pmulhrsw m4, W_11585x2_REG ; out4
+ pmulhrsw m0, W_11585x2_REG ; out2
+%else
+ SCRATCH 5, 9, blockq+16*1
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5
+ UNSCRATCH 5, 9, blockq+16*1
+%endif
+ PSIGNW m3, W_M1_REG ; out3
+ PSIGNW m7, W_M1_REG ; out5
+
+ ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+ SWAP 2, 8
+%endif
+ SWAP 0, 6, 2
+ SWAP 7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+ mova m0, [blockq+ 0] ; IN(0)
+%endif
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+%if ARCH_X86_64 || first_is_idct
+ mova m3, [blockq+ 48] ; IN(3)
+%endif
+%if ARCH_X86_64
+ mova m4, [blockq+ 64] ; IN(4)
+%endif
+ mova m5, [blockq+ 80] ; IN(5)
+ mova m6, [blockq+ 96] ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+ mova m7, [blockq+112] ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+ mova m15, [pw_11585x2] ; often used
+%endif
+ mova m13, [pd_8192] ; rounding
+ mova m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+ ; note different calling conventions for idct8 vs. iadst8 on x86-32
+ VP9_%2_1D
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+ mova [blockq+ 0], m0
+%if second_is_idct == 0
+ mova [blockq+ 48], m3
+ mova [blockq+112], m7
+%endif
+%endif
+ VP9_%4_1D
+
+%if ARCH_X86_64
+ SWAP 6, 8
+%endif
+ pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
+ VP9_IDCT8_WRITEOUT
+ ZERO_BLOCK blockq, 16, 8, m6
+ RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16
+IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16
+IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+; SUMSUB_BA w, 6, 9, 15 ; t6, t9
+; SUMSUB_BA w, 7, 8, 15 ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following simsubs have not been done yet:
+; SUMSUB_BA w, x13, x14, 7 ; t6, t9
+; SUMSUB_BA w, x15, x12, 7 ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+ mova m3, [%1+ 1*%3] ; IN(1)
+ mova m0, [%1+ 3*%3] ; IN(3)
+
+ pmulhrsw m4, m3, [pw_16305x2] ; t14-15
+ pmulhrsw m3, [pw_1606x2] ; t8-9
+ pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
+ pmulhrsw m0, [pw_15679x2] ; t12-13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+ UNSCRATCH 5, 11, %4+ 7*%5
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+ mova m5, [%1+ 1*%3] ; IN(1)
+ mova m4, [%1+ 7*%3] ; IN(7)
+%if %2 <= 8
+ pmulhrsw m2, m5, [pw_16305x2] ; t15
+ pmulhrsw m5, [pw_1606x2] ; t8
+ pmulhrsw m3, m4, [pw_m10394x2] ; t9
+ pmulhrsw m4, [pw_12665x2] ; t14
+%else
+ mova m3, [%1+ 9*%3] ; IN(9)
+ mova m2, [%1+15*%3] ; IN(15)
+
+ ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+ ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14
+%endif
+
+ SUMSUB_BA w, 3, 5, 0 ; t8, t9
+ SUMSUB_BA w, 4, 2, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14
+
+ SCRATCH 4, 10, %4+ 1*%5
+ SCRATCH 5, 11, %4+ 7*%5
+
+ mova m6, [%1+ 3*%3] ; IN(3)
+ mova m7, [%1+ 5*%3] ; IN(5)
+%if %2 <= 8
+ pmulhrsw m0, m7, [pw_14449x2] ; t13
+ pmulhrsw m7, [pw_7723x2] ; t10
+ pmulhrsw m1, m6, [pw_m4756x2] ; t11
+ pmulhrsw m6, [pw_15679x2] ; t12
+%else
+ mova m0, [%1+11*%3] ; IN(11)
+ mova m1, [%1+13*%3] ; IN(13)
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+ ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+ ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+ SUMSUB_BA w, 7, 1, 4 ; t11, t10
+ SUMSUB_BA w, 0, 6, 4 ; t12, t13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+ UNSCRATCH 5, 11, %4+ 7*%5
+%endif
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+ ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+ SUMSUB_BA w, 7, 3, 4 ; t8, t11
+
+ ; backup first register
+ mova [%4+15*%5], m7
+
+ SUMSUB_BA w, 6, 2, 7 ; t9, t10
+ UNSCRATCH 4, 10, %4+ 1*%5
+ SUMSUB_BA w, 0, 4, 7 ; t15, t12
+ SUMSUB_BA w, 1, 5, 7 ; t14. t13
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 2, 5, 7
+ SUMSUB_BA w, 3, 4, 7
+ pmulhrsw m5, [pw_11585x2] ; t10
+ pmulhrsw m4, [pw_11585x2] ; t11
+ pmulhrsw m3, [pw_11585x2] ; t12
+ pmulhrsw m2, [pw_11585x2] ; t13
+%else
+ SCRATCH 6, 10, %4+ 1*%5
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+ UNSCRATCH 6, 10, %4+ 1*%5
+%endif
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+ SCRATCH 0, 8, %4+ 1*%5
+ SCRATCH 1, 9, %4+ 3*%5
+ SCRATCH 2, 10, %4+ 5*%5
+ SCRATCH 3, 11, %4+ 7*%5
+ SCRATCH 4, 12, %4+ 9*%5
+ SCRATCH 5, 13, %4+11*%5
+ SCRATCH 6, 14, %4+13*%5
+
+ ; even (tx8x8)
+%if %2 <= 4
+ mova m3, [%1+ 0*%3] ; IN(0)
+ mova m4, [%1+ 2*%3] ; IN(2)
+
+ pmulhrsw m3, [pw_11585x2] ; t0-t3
+ pmulhrsw m7, m4, [pw_16069x2] ; t6-7
+ pmulhrsw m4, [pw_3196x2] ; t4-5
+
+%if 0 ; overflows :(
+ paddw m6, m7, m4
+ psubw m5, m7, m4
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6
+%endif
+
+ psubw m0, m3, m7
+ paddw m7, m3
+ psubw m1, m3, m6
+ paddw m6, m3
+ psubw m2, m3, m5
+ paddw m5, m3
+
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+%else
+ mova m6, [%1+ 2*%3] ; IN(2)
+ mova m1, [%1+ 4*%3] ; IN(4)
+ mova m7, [%1+ 6*%3] ; IN(6)
+%if %2 <= 8
+ pmulhrsw m0, m1, [pw_15137x2] ; t3
+ pmulhrsw m1, [pw_6270x2] ; t2
+ pmulhrsw m5, m6, [pw_16069x2] ; t7
+ pmulhrsw m6, [pw_3196x2] ; t4
+ pmulhrsw m4, m7, [pw_m9102x2] ; t5
+ pmulhrsw m7, [pw_13623x2] ; t6
+%else
+ mova m4, [%1+10*%3] ; IN(10)
+ mova m0, [%1+12*%3] ; IN(12)
+ mova m5, [%1+14*%3] ; IN(14)
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SUMSUB_BA w, 4, 6, 2 ; t4, t5
+ SUMSUB_BA w, 7, 5, 2 ; t7, t6
+
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 6, 5, 2
+ pmulhrsw m5, [pw_11585x2] ; t5
+ pmulhrsw m6, [pw_11585x2] ; t6
+%else
+ VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6
+%endif
+
+ SCRATCH 5, 15, %4+10*%5
+ mova m2, [%1+ 0*%3] ; IN(0)
+%if %2 <= 8
+ pmulhrsw m2, [pw_11585x2] ; t0 and t1
+ psubw m3, m2, m0
+ paddw m0, m2
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%else
+ mova m3, [%1+ 8*%3] ; IN(8)
+
+ ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+ SUMSUB_BA w, 3, 2, 5
+ pmulhrsw m3, [pw_11585x2] ; t0
+ pmulhrsw m2, [pw_11585x2] ; t1
+%else
+ mova [%1+ 0*%3], m0
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1
+ mova m0, [%1+ 0*%3]
+%endif
+
+ ; from 2 stages back
+ SUMSUB_BA w, 0, 3, 5 ; t0, t3
+
+ SUMSUB_BA w, 7, 0, 5 ; t0, t7
+%endif
+ UNSCRATCH 5, 15, %4+10*%5
+%if ARCH_X86_32
+ SWAP 0, 7
+%endif
+ SCRATCH 7, 15, %4+12*%5
+ SUMSUB_BA w, 1, 2, 7 ; t1, t2
+
+ ; from 1 stage back
+ SUMSUB_BA w, 6, 1, 7 ; t1, t6
+ SUMSUB_BA w, 5, 2, 7 ; t2, t5
+%endif
+ SUMSUB_BA w, 4, 3, 7 ; t3, t4
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 5, 13
+ SWAP 6, 14
+
+ SUMSUB_BA w, 0, 15, 7 ; t0, t15
+ SUMSUB_BA w, 1, 14, 7 ; t1, t14
+ SUMSUB_BA w, 2, 13, 7 ; t2, t13
+ SUMSUB_BA w, 3, 12, 7 ; t3, t12
+ SUMSUB_BA w, 4, 11, 7 ; t4, t11
+ SUMSUB_BA w, 5, 10, 7 ; t5, t10
+%else
+ SWAP 1, 6
+ SWAP 2, 5
+ SWAP 3, 4
+ mova [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+ mova m6, [%4+%2*%5]
+ SUMSUB_BA w, 6, %1, 7
+ SWAP %1, 6
+ mova [%4+%3*%5], m6
+%endmacro
+
+ %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15
+ %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14
+ %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13
+ %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12
+ %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11
+ %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
+%if %2 == 1
+ VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
+
+%if ARCH_X86_64
+ ; backup a different register
+ mova m7, [tmpq+15*16]
+ mova [tmpq+ 1*16], m15
+
+ SUMSUB_BA w, 6, 9, 15 ; t6, t9
+ SUMSUB_BA w, 7, 8, 15 ; t7, t8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
+ mova [tmpq+ 0], m0
+ mova [tmpq+ 32], m1
+ mova [tmpq+ 64], m2
+ mova [tmpq+ 96], m3
+ mova [tmpq+128], m4
+ mova [tmpq+160], m5
+ mova [tmpq+192], m6
+ mova [tmpq+224], m7
+
+ mova m15, [tmpq+ 1*16]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 16], m8
+ mova [tmpq+ 48], m9
+ mova [tmpq+ 80], m10
+ mova [tmpq+112], m11
+ mova [tmpq+144], m12
+ mova [tmpq+176], m13
+ mova [tmpq+208], m14
+ mova [tmpq+240], m15
+%else
+ mova m6, [tmpq+13*16]
+ mova m7, [tmpq+14*16]
+ SUMSUB_BA w, 6, 7 ; t6, t9
+ mova [tmpq+14*16], m6
+ mova [tmpq+13*16], m7
+ mova m7, [tmpq+15*16]
+ mova m6, [tmpq+12*16]
+ SUMSUB_BA w, 7, 6 ; t7, t8
+ mova [tmpq+15*16], m6
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+ mova [tmpq+ 0*16], m0
+ mova [tmpq+ 2*16], m1
+ mova [tmpq+ 4*16], m2
+ mova [tmpq+ 6*16], m3
+ mova [tmpq+10*16], m5
+ mova [tmpq+12*16], m6
+ mova [tmpq+14*16], m7
+
+ mova m0, [tmpq+15*16]
+ mova m1, [tmpq+13*16]
+ mova m2, [tmpq+11*16]
+ mova m3, [tmpq+ 9*16]
+ mova m4, [tmpq+ 7*16]
+ mova m5, [tmpq+ 5*16]
+ mova m7, [tmpq+ 1*16]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+ VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+ pxor m7, m7
+%if ARCH_X86_64
+ ; backup more registers
+ mova [%1+ 2*32], m8
+ mova [%1+ 3*32], m9
+
+ VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ ; restore from cache
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m7, [%1+15*32]
+ mova m8, [%1+ 2*32]
+ mova m9, [%1+ 3*32]
+
+ SUMSUB_BA w, 6, 9, 3 ; t6, t9
+ SUMSUB_BA w, 7, 8, 3 ; t7, t8
+
+ VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*32], m5
+
+ VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m5, [tmpq+ 0*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+13*32]
+ mova m7, [tmpq+14*32]
+ mova m5, [tmpq+15*32]
+ mova m6, [tmpq+12*32]
+ SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+11*32]
+ mova m5, [tmpq+ 9*32]
+ mova m6, [tmpq+ 7*32]
+ mova m7, [tmpq+ 5*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m4, [tmpq+ 3*32]
+ mova m5, [tmpq+ 1*32]
+
+ VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+ mova m%3, [dstq]
+ mova m%5, [dstq+%7]
+ punpcklbw m%2, m%3, m%6
+ punpckhbw m%3, m%6
+ punpcklbw m%4, m%5, m%6
+ punpckhbw m%5, m%6
+ paddw m%2, m%1
+ paddw m%3, m%1
+ paddw m%4, m%1
+ paddw m%5, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], m%2
+ mova [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+%if cpuflag(ssse3)
+ ; 2x2=eob=3, 4x4=eob=10
+ cmp eobd, 38
+ jg .idctfull
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct8x8
+%else
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+%endif
+
+ ; dc-only
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 7
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ RET
+
+ DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+ mov tmpq, rsp
+ VP9_IDCT16_1D blockq, 1, 8, 0
+
+ mov cntd, 2
+ mov dst_bakq, dstq
+.loop2_8x8:
+ VP9_IDCT16_1D tmpq, 2, 8, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 8, m0
+ RET
+%endif
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT16_1D blockq, 1, 16, 0
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_IDCT16_1D tmpq, 2, 16, 0
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+%macro VP9_IDCT16_YMM_1D 0
+ VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14
+
+ SUMSUB_BA w, 9, 1, 0 ; t8, t9
+ SUMSUB_BA w, 7, 15, 0 ; t15, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12
+
+ SUMSUB_BA w, 5, 13, 0 ; t11, t10
+ SUMSUB_BA w, 11, 3, 0 ; t12, t13
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
+
+ SUMSUB_BA w, 5, 9, 0 ; t8, t11
+ SUMSUB_BA w, 3, 15, 0 ; t9, t10
+ SUMSUB_BA w, 11, 7, 0 ; t15, t12
+ SUMSUB_BA w, 13, 1, 0 ; t14, t13
+
+ SUMSUB_BA w, 15, 1, 0
+ SUMSUB_BA w, 9, 7, 0
+ pmulhrsw m1, [pw_11585x2] ; t10
+ pmulhrsw m7, [pw_11585x2] ; t11
+ pmulhrsw m9, [pw_11585x2] ; t12
+ pmulhrsw m15, [pw_11585x2] ; t13
+
+ ; even (tx8x8)
+ mova m4, [blockq+128]
+ mova [blockq+128], m5
+ VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6
+ mova m0, [blockq+ 0]
+ SUMSUB_BA w, 8, 0, 5
+ pmulhrsw m8, [pw_11585x2] ; t0
+ pmulhrsw m0, [pw_11585x2] ; t1
+
+ SUMSUB_BA w, 10, 2, 5 ; t4, t5
+ SUMSUB_BA w, 6, 14, 5 ; t7, t6
+ SUMSUB_BA w, 12, 8, 5 ; t0, t3
+ SUMSUB_BA w, 4, 0, 5 ; t1, t2
+
+ SUMSUB_BA w, 2, 14, 5
+ pmulhrsw m14, [pw_11585x2] ; t5
+ pmulhrsw m2, [pw_11585x2] ; t6
+
+ SUMSUB_BA w, 6, 12, 5 ; t0, t7
+ SUMSUB_BA w, 2, 4, 5 ; t1, t6
+ SUMSUB_BA w, 14, 0, 5 ; t2, t5
+ SUMSUB_BA w, 10, 8, 5 ; t3, t4
+
+ ; final stage
+ SUMSUB_BA w, 11, 6, 5 ; out0, out15
+ SUMSUB_BA w, 13, 2, 5 ; out1, out14
+ SUMSUB_BA w, 15, 14, 5 ; out2, out13
+ SUMSUB_BA w, 9, 10, 5 ; out3, out12
+ SUMSUB_BA w, 7, 8, 5 ; out4, out11
+ SUMSUB_BA w, 1, 0, 5 ; out5, out10
+ SUMSUB_BA w, 3, 4, 5 ; out6, out9
+ mova m5, [blockq+128]
+ mova [blockq+192], m3
+ SUMSUB_BA w, 5, 12, 3 ; out7, out8
+
+ SWAP 0, 11, 8, 12, 10
+ SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5
+%endmacro
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova xm%2, [dstq]
+ mova xm%4, [dstq+strideq*2]
+ vinserti128 m%2, m%2, [dstq+strideq], 1
+ vinserti128 m%4, m%4, [dstq+stride3q], 1
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], xm%2
+ mova [dstq+strideq*2], xm%4
+ vextracti128 [dstq+strideq], m%2, 1
+ vextracti128 [dstq+stride3q], m%4, 1
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
+ cmp eobd, 1 ; faster path for when only DC is set
+ jg .idctfull
+
+ ; dc-only
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+.loop_dc:
+ VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+4*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst, stride, block, eob
+.idctfull:
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_IDCT16_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_IDCT16_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+ mova m0, [%1+ 0*32] ; in0
+ mova m1, [%1+15*32] ; in15
+ mova m2, [%1+ 7*32] ; in7
+ mova m3, [%1+ 8*32] ; in8
+
+ VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d]
+ SCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w]
+ UNSCRATCH 4, 8, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 8*%%str], m3
+
+ mova m1, [%1+ 2*32] ; in2
+ mova m0, [%1+13*32] ; in13
+ mova m3, [%1+ 5*32] ; in5
+ mova m2, [%1+10*32] ; in10
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
+ SCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w]
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w]
+
+ SCRATCH 0, 12, tmpq+ 2*%%str
+ SCRATCH 1, 13, tmpq+13*%%str
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+10*%%str], m3
+
+ mova m2, [%1+ 4*32] ; in4
+ mova m3, [%1+11*32] ; in11
+ mova m0, [%1+ 3*32] ; in3
+ mova m1, [%1+12*32] ; in12
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d]
+ SCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 4*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w]
+
+ SCRATCH 0, 8, tmpq+ 4*%%str
+ mova [tmpq+11*%%str], m1 ; t4:m1->r11
+ UNSCRATCH 0, 10, tmpq+ 0*%%str
+ UNSCRATCH 1, 11, tmpq+15*%%str
+
+ ; round 2 interleaved part 1
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d]
+ SCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w]
+ UNSCRATCH 4, 9, tmpq+ 3*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w]
+
+ SCRATCH 0, 10, tmpq+ 0*%%str
+ SCRATCH 1, 11, tmpq+15*%%str
+ SCRATCH 2, 14, tmpq+ 3*%%str
+ SCRATCH 3, 15, tmpq+12*%%str
+
+ mova m2, [%1+ 6*32] ; in6
+ mova m3, [%1+ 9*32] ; in9
+ mova m0, [%1+ 1*32] ; in1
+ mova m1, [%1+14*32] ; in14
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d]
+ SCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w]
+ UNSCRATCH 4, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w]
+
+ ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+ ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+ UNSCRATCH 4, 12, tmpq+ 2*%%str
+ UNSCRATCH 5, 13, tmpq+13*%%str
+ SCRATCH 0, 12, tmpq+ 1*%%str
+ SCRATCH 1, 13, tmpq+14*%%str
+
+ ; remainder of round 2 (rest of t8-15)
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 6*%%str
+ VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w]
+
+ ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+ UNSCRATCH 6, 14, tmpq+ 3*%%str
+ UNSCRATCH 7, 15, tmpq+12*%%str
+
+ SUMSUB_BA w, 3, 7, 1
+ PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w]
+ SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_11585x2] ; m7=out6[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out9[w]
+%else
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 6*%%str], m7
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ UNSCRATCH 7, 11, tmpq+15*%%str
+ mova [tmpq+13*%%str], m2
+ SCRATCH 3, 11, tmpq+ 9*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d]
+ SCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w]
+ UNSCRATCH 0, 9, tmpq+ 2*%%str
+ VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192]
+ PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w]
+
+ ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 1
+ pmulhrsw m7, [pw_m11585x2] ; m7=out5[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out10[w]
+%else
+ PSIGNW m7, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+ 7*%%str]
+ mova m1, [tmpq+11*%%str]
+ mova [tmpq+ 7*%%str], m6
+ mova [tmpq+11*%%str], m4
+ mova m4, [tmpq+ 5*%%str]
+ SCRATCH 5, 14, tmpq+ 5*%%str
+ SCRATCH 7, 15, tmpq+ 8*%%str
+ UNSCRATCH 6, 8, tmpq+ 4*%%str
+ UNSCRATCH 5, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+14*%%str
+
+ ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+ SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w]
+ mova m0, [tmpq+10*%%str]
+ SCRATCH 1, 12, tmpq+ 1*%%str
+ SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w]
+ SCRATCH 6, 13, tmpq+ 4*%%str
+ SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w]
+ SCRATCH 7, 8, tmpq+10*%%str
+ SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w]
+ SCRATCH 5, 9, tmpq+14*%%str
+
+ VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d]
+ SCRATCH 6, 10, tmpq+ 0*%%str
+ VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192]
+ UNSCRATCH 6, 10, tmpq+ 0*%%str
+ PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w]
+ VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w]
+
+ UNSCRATCH 1, 8, tmpq+10*%%str
+ UNSCRATCH 5, 9, tmpq+14*%%str
+ UNSCRATCH 6, 12, tmpq+ 1*%%str
+ UNSCRATCH 7, 13, tmpq+ 4*%%str
+ SCRATCH 4, 9, tmpq+14*%%str
+
+ SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w]
+ SUMSUB_BA w, 5, 7, 4
+ PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w]
+
+ ; unfortunately, the code below overflows in some cases, e.g.
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+ SUMSUB_BA w, 7, 6, 4
+ pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
+ pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
+ SWAP 6, 7
+ SUMSUB_BA w, 3, 2, 4
+ pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
+ pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
+%else
+ SCRATCH 5, 8, tmpq+10*%%str
+ VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
+ VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
+ UNSCRATCH 5, 8, tmpq+10*%%str
+%endif
+
+ ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+ ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1
+%if ARCH_X86_64
+ mova m13, [tmpq+ 6*%%str]
+ TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m11
+ mova [tmpq+ 4*16], m14
+ mova [tmpq+ 6*16], m0
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+ mova [tmpq+ 8*16], m3
+ mova [tmpq+10*16], m15
+ mova [tmpq+12*16], m13
+ mova [tmpq+14*16], m6
+
+ TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
+ mova [tmpq+ 1*16], m7
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m11
+ mova [tmpq+ 7*16], m2
+ mova [tmpq+ 9*16], m9
+ mova [tmpq+11*16], m14
+ mova [tmpq+13*16], m0
+ mova [tmpq+15*16], m5
+%else
+ mova [tmpq+12*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+15*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+ 8*%%str]
+ TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*16], m1
+ mova [tmpq+ 2*16], m2
+ mova [tmpq+ 4*16], m5
+ mova [tmpq+ 6*16], m0
+ mova [tmpq+10*16], m7
+ mova m3, [tmpq+12*%%str]
+ mova [tmpq+12*16], m4
+ mova m4, [tmpq+14*%%str]
+ mova [tmpq+14*16], m6
+
+ mova m0, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m2, [tmpq+ 7*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m7, [tmpq+ 1*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*16], m0
+ mova [tmpq+ 3*16], m1
+ mova [tmpq+ 5*16], m2
+ mova [tmpq+ 7*16], m3
+ mova [tmpq+11*16], m5
+ mova [tmpq+13*16], m6
+ mova [tmpq+15*16], m7
+%endif
+%else
+ pxor m4, m4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+ mova m12, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m1, [tmpq+ 3*%%str]
+ mova m11, [tmpq+ 7*%%str]
+ mova m14, [tmpq+11*%%str]
+ mova m0, [tmpq+13*%%str]
+
+ VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6
+%else
+ mova [tmpq+ 0*%%str], m2
+ mova [tmpq+ 1*%%str], m5
+ mova [tmpq+ 2*%%str], m7
+ mova m2, [tmpq+ 9*%%str]
+ VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 5*%%str]
+ VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 8*%%str]
+ VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m5, [tmpq+ 6*%%str]
+ VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m3, [tmpq+ 3*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+ 7*%%str]
+ mova m3, [tmpq+ 0*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+14*%%str]
+ mova m3, [tmpq+11*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+ lea dstq, [dstq+strideq*2]
+ mova m0, [tmpq+13*%%str]
+ mova m3, [tmpq+ 1*%%str]
+ VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
+%endif
+
+ SWAP 0, 4 ; zero
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_%2_1D blockq, 1
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_%4_1D tmpq, 2
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_FN idct, IDCT16, iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct, IDCT16, sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct, IDCT16, iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct, IDCT16, avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
+; out: m[0-15] except m6, which is in [blockq+192]
+; uses blockq as scratch space
+%macro VP9_IADST16_YMM_1D 0
+ mova [blockq+ 32], m3
+ mova [blockq+ 64], m7
+ mova [blockq+ 96], m8
+
+ ; first half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d]
+ VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w]
+ VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d]
+ VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w]
+ VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w]
+
+ ; half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w]
+ VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w]
+
+ SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6
+ SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7
+
+ mova m0, [blockq+ 0]
+ mova m4, [blockq+128]
+ mova m3, [blockq+ 32]
+ mova m7, [blockq+ 64]
+ mova m8, [blockq+ 96]
+ mova [blockq+ 0], m1
+ mova [blockq+128], m14
+ mova [blockq+ 32], m6
+ mova [blockq+ 64], m9
+ mova [blockq+ 96], m10
+
+ ; second half of round 1
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d]
+ VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w]
+ VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d]
+ VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w]
+ VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w]
+
+ ; second half of round 2 t8-15
+ VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d]
+ VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w]
+ VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w]
+
+ SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4
+ SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5
+
+ mova m10, [blockq+ 96]
+ mova [blockq+ 96], m12
+
+ ; round 3
+ VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w]
+ VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192]
+ PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d]
+ VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192]
+ PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w]
+
+ mova m1, [blockq+ 0]
+ mova m14, [blockq+128]
+ mova m6, [blockq+ 32]
+ mova m9, [blockq+ 64]
+ mova m12, [blockq+ 96]
+ mova [blockq+ 0], m10
+ mova [blockq+128], m5
+
+ SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a
+ SUMSUB_BA w, 1, 3, 5
+ PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a
+
+ SUMSUB_BA w, 9, 11, 5
+ PSIGNW m9, [pw_m1] ; m9=out1, m11=t10
+ SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6
+ mova m5, [blockq+128]
+ mova [blockq+192], m11
+ PSIGNW m15, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
+
+ PSIGNW m3, [pw_m1]
+ VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
+ VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
+
+ mova m10, [blockq+ 0]
+
+ SWAP 0, 14, 6, 11, 8, 12, 10
+ SWAP 1, 9, 15, 4, 7, 3, 5
+ SWAP 5, 9, 15
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+%macro IADST16_YMM_FN 4
+INIT_YMM avx2
+cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
+ mova m1, [blockq+ 32]
+ mova m2, [blockq+ 64]
+ mova m3, [blockq+ 96]
+ mova m5, [blockq+160]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ mova m8, [blockq+256]
+ mova m9, [blockq+288]
+ mova m10, [blockq+320]
+ mova m11, [blockq+352]
+ mova m12, [blockq+384]
+ mova m13, [blockq+416]
+ mova m14, [blockq+448]
+ mova m15, [blockq+480]
+
+ VP9_%2_YMM_1D
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ [blockq+192], [blockq+128], 1
+ mova [blockq+ 0], m0
+ VP9_%4_YMM_1D
+
+ mova [blockq+224], m7
+
+ ; store
+ VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ mova m6, [blockq+192]
+ mova m7, [blockq+224]
+ VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+ VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
+ lea dstq, [dstq+2*strideq]
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ pxor m0, m0
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_YMM_FN idct, IDCT16, iadst, IADST16
+IADST16_YMM_FN iadst, IADST16, idct, IDCT16
+IADST16_YMM_FN iadst, IADST16, iadst, IADST16
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%if %2 == 1
+%assign %%str mmsize
+%else
+%assign %%str 64
+%endif
+
+ ; first do t0-15, this can be done identical to idct16x16
+ VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+
+ ; store everything on stack to make space available for t16-31
+ ; we store interleaved with the output of the second half (t16-31)
+ ; so we don't need to allocate extra stack space
+ mova [tmpq+ 0*%%str], m0 ; t0
+ mova [tmpq+ 4*%%str], m1 ; t1
+ mova [tmpq+ 8*%%str], m2 ; t2
+ mova [tmpq+12*%%str], m3 ; t3
+ mova [tmpq+16*%%str], m4 ; t4
+ mova [tmpq+20*%%str], m5 ; t5
+%if ARCH_X86_64
+ mova [tmpq+22*%%str], m10 ; t10
+ mova [tmpq+18*%%str], m11 ; t11
+ mova [tmpq+14*%%str], m12 ; t12
+ mova [tmpq+10*%%str], m13 ; t13
+ mova [tmpq+ 6*%%str], m14 ; t14
+ mova [tmpq+ 2*%%str], m15 ; t15
+%endif
+
+ mova m0, [tmpq+ 30*%%str]
+ UNSCRATCH 1, 6, tmpq+26*%%str
+ UNSCRATCH 2, 8, tmpq+24*%%str
+ UNSCRATCH 3, 9, tmpq+28*%%str
+ SUMSUB_BA w, 1, 3, 4 ; t6, t9
+ SUMSUB_BA w, 0, 2, 4 ; t7, t8
+
+ mova [tmpq+24*%%str], m1 ; t6
+ mova [tmpq+28*%%str], m0 ; t7
+ mova [tmpq+30*%%str], m2 ; t8
+ mova [tmpq+26*%%str], m3 ; t9
+
+ ; then, secondly, do t16-31
+%if %3 <= 8
+ mova m4, [%1+ 1*64]
+ mova m7, [%1+ 7*64]
+
+ pmulhrsw m1, m4, [pw_16364x2] ;t31
+ pmulhrsw m4, [pw_804x2] ;t16
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30
+
+ pmulhrsw m3, m7, [pw_m5520x2] ;t19
+ pmulhrsw m7, [pw_15426x2] ;t28
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%else
+ mova m0, [%1+ 1*64]
+ mova m1, [%1+15*64]
+%if %3 <= 16
+ pmulhrsw m5, m0, [pw_16364x2]
+ pmulhrsw m0, [pw_804x2]
+ pmulhrsw m4, m1, [pw_m11003x2]
+ pmulhrsw m1, [pw_12140x2]
+%else
+ mova m4, [%1+17*64]
+ mova m5, [%1+31*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+ SUMSUB_BA w, 4, 0, 2
+ SUMSUB_BA w, 1, 5, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30
+
+ SCRATCH 4, 13, tmpq+ 1*%%str
+ SCRATCH 5, 12, tmpq+15*%%str
+
+ mova m2, [%1+ 7*64]
+ mova m3, [%1+ 9*64]
+%if %3 <= 16
+ pmulhrsw m7, m3, [pw_14811x2]
+ pmulhrsw m3, [pw_7005x2]
+ pmulhrsw m6, m2, [pw_m5520x2]
+ pmulhrsw m2, [pw_15426x2]
+%else
+ mova m7, [%1+23*64]
+ mova m6, [%1+25*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+ SUMSUB_BA w, 3, 6, 4
+ SUMSUB_BA w, 7, 2, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+ UNSCRATCH 5, 12, tmpq+15*%%str
+ SUMSUB_BA w, 6, 0, 4
+ mova [tmpq+25*%%str], m6 ; t19
+ UNSCRATCH 4, 13, tmpq+ 1*%%str
+ SUMSUB_BA w, 7, 1, 6
+ SUMSUB_BA w, 3, 4, 6
+ mova [tmpq+23*%%str], m3 ; t16
+ SUMSUB_BA w, 2, 5, 6
+
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28
+
+ SCRATCH 0, 10, tmpq+ 1*%%str
+ SCRATCH 1, 11, tmpq+ 7*%%str
+ SCRATCH 2, 9, tmpq+ 9*%%str
+ SCRATCH 4, 14, tmpq+15*%%str
+ SCRATCH 5, 15, tmpq+17*%%str
+ SCRATCH 7, 13, tmpq+31*%%str
+
+%if %3 <= 8
+ mova m0, [%1+ 5*64]
+ mova m3, [%1+ 3*64]
+
+ pmulhrsw m5, m0, [pw_15893x2] ;t27
+ pmulhrsw m0, [pw_3981x2] ;t20
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26
+
+ pmulhrsw m6, m3, [pw_m2404x2] ;t23
+ pmulhrsw m3, [pw_16207x2] ;t24
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+ mova m4, [%1+ 5*64]
+ mova m5, [%1+11*64]
+%if %3 <= 16
+ pmulhrsw m1, m4, [pw_15893x2]
+ pmulhrsw m4, [pw_3981x2]
+ pmulhrsw m0, m5, [pw_m8423x2]
+ pmulhrsw m5, [pw_14053x2]
+%else
+ mova m0, [%1+21*64]
+ mova m1, [%1+27*64]
+
+ VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+ SUMSUB_BA w, 0, 4, 2
+ SUMSUB_BA w, 5, 1, 2
+
+ VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+ SCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 12, tmpq+11*%%str
+
+ mova m7, [%1+ 3*64]
+ mova m6, [%1+13*64]
+%if %3 <= 16
+ pmulhrsw m3, m6, [pw_13160x2]
+ pmulhrsw m6, [pw_9760x2]
+ pmulhrsw m2, m7, [pw_m2404x2]
+ pmulhrsw m7, [pw_16207x2]
+%else
+ mova m2, [%1+29*64]
+ mova m3, [%1+19*64]
+ VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+ SUMSUB_BA w, 6, 2, 4
+ SUMSUB_BA w, 3, 7, 4
+
+ VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+ ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+ ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+ UNSCRATCH 4, 12, tmpq+11*%%str
+ SUMSUB_BA w, 0, 6, 5
+ SUMSUB_BA w, 4, 2, 5
+ UNSCRATCH 5, 8, tmpq+ 5*%%str
+ SCRATCH 4, 8, tmpq+11*%%str
+ SUMSUB_BA w, 1, 7, 4
+ SUMSUB_BA w, 5, 3, 4
+ SCRATCH 5, 12, tmpq+ 5*%%str
+
+ VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+ ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+ ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+ UNSCRATCH 5, 9, tmpq+ 9*%%str
+ mova m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+ SUMSUB_BA w, 1, 5, 9
+ SUMSUB_BA w, 0, 4, 9
+%else
+ SUMSUB_BADC w, 1, 5, 0, 4
+%endif
+ mova [tmpq+29*%%str], m1 ; t17
+ mova [tmpq+21*%%str], m0 ; t16
+ UNSCRATCH 0, 10, tmpq+ 1*%%str
+ UNSCRATCH 1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 2, 0, 9
+ SUMSUB_BA w, 3, 1, 9
+%else
+ SUMSUB_BADC w, 2, 0, 3, 1
+%endif
+ mova [tmpq+ 9*%%str], m2 ; t18
+ mova [tmpq+13*%%str], m3 ; t19
+ SCRATCH 0, 10, tmpq+23*%%str
+ SCRATCH 1, 11, tmpq+27*%%str
+
+ UNSCRATCH 2, 14, tmpq+15*%%str
+ UNSCRATCH 3, 15, tmpq+17*%%str
+ SUMSUB_BA w, 6, 2, 0
+ SUMSUB_BA w, 7, 3, 0
+ SCRATCH 6, 14, tmpq+ 3*%%str
+ SCRATCH 7, 15, tmpq+ 7*%%str
+
+ UNSCRATCH 0, 8, tmpq+11*%%str
+ mova m1, [tmpq+25*%%str] ; t19
+ UNSCRATCH 6, 12, tmpq+ 5*%%str
+ UNSCRATCH 7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+ SUMSUB_BA w, 0, 1, 9
+ SUMSUB_BA w, 6, 7, 9
+%else
+ SUMSUB_BADC w, 0, 1, 6, 7
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+ ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3)
+%if ARCH_X86_64
+ SUMSUB_BA w, 4, 7, 8
+ SUMSUB_BA w, 5, 1, 8
+%else
+ SUMSUB_BADC w, 4, 7, 5, 1
+%endif
+
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m4, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+ pmulhrsw m5, [pw_11585x2]
+
+ mova [tmpq+ 5*%%str], m7 ; t23
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+ SUMSUB_BA w, 7, 3, 10
+ SUMSUB_BA w, 1, 2, 10
+%else
+ SUMSUB_BADC w, 7, 3, 1, 2
+%endif
+
+ pmulhrsw m3, [pw_11585x2]
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m2, [pw_11585x2]
+ pmulhrsw m1, [pw_11585x2]
+%else
+ SCRATCH 0, 8, tmpq+15*%%str
+ SCRATCH 6, 9, tmpq+17*%%str
+ VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6
+ mova [tmpq+ 5*%%str], m7 ; t23
+ UNSCRATCH 7, 10, tmpq+23*%%str
+ VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6
+ SCRATCH 1, 13, tmpq+25*%%str
+ UNSCRATCH 1, 11, tmpq+27*%%str
+ VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6
+ VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6
+ UNSCRATCH 0, 8, tmpq+15*%%str
+ UNSCRATCH 6, 9, tmpq+17*%%str
+%endif
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+ ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+ ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+ mova [tmpq+17*%%str], m2 ; t20
+ mova [tmpq+ 1*%%str], m3 ; t21
+%if ARCH_X86_64
+ mova [tmpq+25*%%str], m13 ; t22
+
+ mova m8, [tmpq+ 0*%%str] ; t0
+ mova m9, [tmpq+ 4*%%str] ; t1
+ mova m12, [tmpq+ 8*%%str] ; t2
+ mova m11, [tmpq+12*%%str] ; t3
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ mova m13, [tmpq+24*%%str] ; t6
+
+ SUMSUB_BA w, 6, 8, 10
+ mova [tmpq+ 3*%%str], m8 ; t15
+ SUMSUB_BA w, 0, 9, 8
+ SUMSUB_BA w, 15, 12, 8
+ SUMSUB_BA w, 14, 11, 8
+ SUMSUB_BA w, 1, 2, 8
+ SUMSUB_BA w, 7, 3, 8
+ SUMSUB_BA w, 5, 13, 8
+ mova m10, [tmpq+28*%%str] ; t7
+ SUMSUB_BA w, 4, 10, 8
+%if cpuflag(avx2)
+ ; the "shitty" about this idct is that the final pass does the outermost
+ ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
+ ; to be sequential, which means I need to load/store half of the sumsub
+ ; intermediates back to/from memory to get a 16x16 transpose going...
+ ; This would be easier if we had more (e.g. 32) YMM regs here.
+ mova [tmpq+ 7*%%str], m9
+ mova [tmpq+11*%%str], m12
+ mova [tmpq+15*%%str], m11
+ mova [tmpq+19*%%str], m2
+ mova [tmpq+23*%%str], m3
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+31*%%str], m10
+ mova [tmpq+12*%%str], m5
+
+ mova m13, [tmpq+30*%%str] ; t8
+ mova m12, [tmpq+26*%%str] ; t9
+ mova m11, [tmpq+22*%%str] ; t10
+ mova m10, [tmpq+18*%%str] ; t11
+ mova m9, [tmpq+17*%%str] ; t20
+ mova m8, [tmpq+ 1*%%str] ; t21
+ mova m3, [tmpq+25*%%str] ; t22
+ mova m2, [tmpq+ 5*%%str] ; t23
+
+ SUMSUB_BA w, 9, 10, 5
+ SUMSUB_BA w, 8, 11, 5
+ SUMSUB_BA w, 3, 12, 5
+ SUMSUB_BA w, 2, 13, 5
+ mova [tmpq+ 1*%%str], m10
+ mova [tmpq+ 5*%%str], m11
+ mova [tmpq+17*%%str], m12
+ mova [tmpq+25*%%str], m13
+
+ mova m13, [tmpq+14*%%str] ; t12
+ mova m12, [tmpq+10*%%str] ; t13
+ mova m11, [tmpq+ 9*%%str] ; t18
+ mova m10, [tmpq+13*%%str] ; t19
+
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+ 9*%%str], m13
+ mova [tmpq+13*%%str], m12
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+
+ mova m13, [tmpq+ 6*%%str] ; t14
+ mova m12, [tmpq+ 2*%%str] ; t15
+ mova m11, [tmpq+21*%%str] ; t16
+ mova m10, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 11, 12, 5
+ SUMSUB_BA w, 10, 13, 5
+ mova [tmpq+21*%%str], m12
+ mova [tmpq+29*%%str], m13
+ mova m12, [tmpq+10*%%str]
+ mova m13, [tmpq+14*%%str]
+
+ TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \
+ 2, 3, 8, 9, 12, 13, 10, 11, \
+ [tmpq+12*%%str], [tmpq+ 8*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 4*%%str], m15
+ mova [tmpq+ 6*%%str], m14
+ mova [tmpq+10*%%str], m7
+ mova [tmpq+12*%%str], m5
+ mova [tmpq+14*%%str], m4
+ mova [tmpq+16*%%str], m2
+ mova [tmpq+18*%%str], m3
+ mova [tmpq+20*%%str], m8
+ mova [tmpq+22*%%str], m9
+ mova [tmpq+24*%%str], m12
+ mova [tmpq+26*%%str], m13
+ mova [tmpq+28*%%str], m10
+ mova [tmpq+30*%%str], m11
+
+ mova m0, [tmpq+21*%%str]
+ mova m1, [tmpq+29*%%str]
+ mova m2, [tmpq+13*%%str]
+ mova m3, [tmpq+ 9*%%str]
+ mova m4, [tmpq+ 1*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m7, [tmpq+25*%%str]
+ mova m8, [tmpq+31*%%str]
+ mova m9, [tmpq+27*%%str]
+ mova m10, [tmpq+23*%%str]
+ mova m11, [tmpq+19*%%str]
+ mova m12, [tmpq+15*%%str]
+ mova m13, [tmpq+11*%%str]
+ mova m14, [tmpq+ 7*%%str]
+ mova m15, [tmpq+ 3*%%str]
+ TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ [tmpq+17*%%str], [tmpq+ 9*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 3*%%str], m1
+ mova [tmpq+ 5*%%str], m2
+ mova [tmpq+ 7*%%str], m3
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+13*%%str], m6
+ mova [tmpq+15*%%str], m7
+ mova [tmpq+17*%%str], m8
+ mova [tmpq+19*%%str], m9
+ mova [tmpq+21*%%str], m10
+ mova [tmpq+23*%%str], m11
+ mova [tmpq+25*%%str], m12
+ mova [tmpq+27*%%str], m13
+ mova [tmpq+29*%%str], m14
+ mova [tmpq+31*%%str], m15
+%else ; !avx2
+ TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m15
+ mova [tmpq+12*%%str], m14
+ mova [tmpq+16*%%str], m1
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+28*%%str], m4
+
+ mova m8, [tmpq+ 3*%%str] ; t15
+ TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0
+ mova [tmpq+ 3*%%str], m10
+ mova [tmpq+ 7*%%str], m13
+ mova [tmpq+11*%%str], m3
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+19*%%str], m11
+ mova [tmpq+23*%%str], m12
+ mova [tmpq+27*%%str], m9
+ mova [tmpq+31*%%str], m8
+
+ mova m15, [tmpq+30*%%str] ; t8
+ mova m14, [tmpq+26*%%str] ; t9
+ mova m13, [tmpq+22*%%str] ; t10
+ mova m12, [tmpq+18*%%str] ; t11
+ mova m11, [tmpq+14*%%str] ; t12
+ mova m10, [tmpq+10*%%str] ; t13
+ mova m9, [tmpq+ 6*%%str] ; t14
+ mova m8, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ mova m1, [tmpq+25*%%str] ; t22
+
+ SUMSUB_BA w, 7, 8, 0
+ mova [tmpq+ 2*%%str], m8
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BA w, 6, 9, 8
+ SUMSUB_BA w, 5, 10, 8
+ SUMSUB_BA w, 4, 11, 8
+ SUMSUB_BA w, 3, 12, 8
+ SUMSUB_BA w, 2, 13, 8
+ SUMSUB_BA w, 1, 14, 8
+ SUMSUB_BA w, 0, 15, 8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m8, [tmpq+ 2*%%str]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 2*%%str], m8
+ mova [tmpq+ 6*%%str], m9
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+ mova [tmpq+18*%%str], m12
+ mova [tmpq+22*%%str], m13
+ mova [tmpq+26*%%str], m14
+ mova [tmpq+30*%%str], m15
+%endif ; avx2
+%else
+ mova m2, [tmpq+24*%%str] ; t6
+ mova m3, [tmpq+28*%%str] ; t7
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+24*%%str], m5
+ mova [tmpq+23*%%str], m2
+ mova [tmpq+28*%%str], m4
+ mova [tmpq+19*%%str], m3
+
+ mova m2, [tmpq+16*%%str] ; t4
+ mova m3, [tmpq+20*%%str] ; t5
+ SUMSUB_BA w, 1, 2, 5
+ SUMSUB_BA w, 7, 3, 5
+ mova [tmpq+15*%%str], m2
+ mova [tmpq+11*%%str], m3
+
+ mova m2, [tmpq+ 0*%%str] ; t0
+ mova m3, [tmpq+ 4*%%str] ; t1
+ SUMSUB_BA w, 6, 2, 5
+ SUMSUB_BA w, 0, 3, 5
+ mova [tmpq+31*%%str], m2
+ mova [tmpq+27*%%str], m3
+
+ mova m2, [tmpq+ 8*%%str] ; t2
+ mova m3, [tmpq+12*%%str] ; t3
+ mova m5, [tmpq+ 7*%%str]
+ mova m4, [tmpq+ 3*%%str]
+ SUMSUB_BADC w, 5, 2, 4, 3
+ mova [tmpq+ 7*%%str], m2
+ mova [tmpq+ 3*%%str], m3
+
+ mova m3, [tmpq+28*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+ mova [tmpq+ 0*%%str], m6
+ mova [tmpq+ 4*%%str], m0
+ mova [tmpq+ 8*%%str], m5
+ mova [tmpq+12*%%str], m4
+ mova [tmpq+20*%%str], m7
+ mova [tmpq+24*%%str], m2
+ mova [tmpq+28*%%str], m3
+
+ mova m6, [tmpq+19*%%str]
+ mova m0, [tmpq+23*%%str]
+ mova m5, [tmpq+11*%%str]
+ mova m4, [tmpq+15*%%str]
+ mova m1, [tmpq+ 3*%%str]
+ mova m7, [tmpq+ 7*%%str]
+ mova m3, [tmpq+31*%%str]
+ TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+ mova [tmpq+ 3*%%str], m6
+ mova [tmpq+ 7*%%str], m0
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+15*%%str], m4
+ mova [tmpq+23*%%str], m7
+ mova [tmpq+27*%%str], m2
+ mova [tmpq+31*%%str], m3
+
+ mova m1, [tmpq+ 6*%%str] ; t14
+ mova m0, [tmpq+ 2*%%str] ; t15
+ mova m7, [tmpq+21*%%str] ; t16
+ mova m6, [tmpq+29*%%str] ; t17
+ SUMSUB_BA w, 7, 0, 2
+ SUMSUB_BA w, 6, 1, 2
+ mova [tmpq+29*%%str], m7
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+21*%%str], m6
+ mova [tmpq+ 6*%%str], m1
+
+ mova m1, [tmpq+14*%%str] ; t12
+ mova m0, [tmpq+10*%%str] ; t13
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ SUMSUB_BA w, 5, 0, 2
+ SUMSUB_BA w, 4, 1, 2
+ mova [tmpq+10*%%str], m0
+ mova [tmpq+14*%%str], m1
+
+ mova m1, [tmpq+22*%%str] ; t10
+ mova m0, [tmpq+18*%%str] ; t11
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+ 1*%%str] ; t21
+ SUMSUB_BA w, 3, 0, 6
+ SUMSUB_BA w, 2, 1, 6
+ mova [tmpq+18*%%str], m0
+ mova [tmpq+22*%%str], m1
+
+ mova m7, [tmpq+30*%%str] ; t8
+ mova m6, [tmpq+26*%%str] ; t9
+ mova m1, [tmpq+25*%%str] ; t22
+ mova m0, [tmpq+ 5*%%str] ; t23
+ SUMSUB_BADC w, 1, 6, 0, 7
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+
+ mova m7, [tmpq+29*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m0, [tmpq+ 2*%%str]
+ mova m1, [tmpq+ 6*%%str]
+ mova m2, [tmpq+10*%%str]
+ mova m3, [tmpq+14*%%str]
+ mova m4, [tmpq+18*%%str]
+ mova m5, [tmpq+22*%%str]
+ mova m7, [tmpq+30*%%str]
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+ mova [tmpq+ 2*%%str], m0
+ mova [tmpq+ 6*%%str], m1
+ mova [tmpq+10*%%str], m2
+ mova [tmpq+14*%%str], m3
+ mova [tmpq+22*%%str], m5
+ mova [tmpq+26*%%str], m6
+ mova [tmpq+30*%%str], m7
+%endif
+%else
+ ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+ ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+ ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+ ; t20-22 is in m4-6
+ ; t24-31 is in m8-15
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+ SUMSUB_BA w, %4, %1, %5
+ SUMSUB_BA w, %3, %2, %5
+ VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+ add dstq, stride2q
+%endif
+ VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+ sub dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+ pxor m10, m10
+
+ ; store t0-1 and t30-31
+ mova m8, [tmpq+ 0*%%str]
+ mova m9, [tmpq+ 4*%%str]
+ %%STORE_2X2 8, 9, 0, 6, 12, 11, 10
+
+ ; store t2-3 and t28-29
+ mova m8, [tmpq+ 8*%%str]
+ mova m9, [tmpq+12*%%str]
+ %%STORE_2X2 8, 9, 14, 15, 12, 11, 10
+
+ ; store t4-5 and t26-27
+ mova m8, [tmpq+16*%%str]
+ mova m9, [tmpq+20*%%str]
+ %%STORE_2X2 8, 9, 7, 1, 12, 11, 10
+
+ ; store t6-7 and t24-25
+ mova m8, [tmpq+24*%%str]
+ mova m9, [tmpq+28*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t8-9 and t22-23
+ mova m8, [tmpq+30*%%str]
+ mova m9, [tmpq+26*%%str]
+ mova m0, [tmpq+ 5*%%str]
+ %%STORE_2X2 8, 9, 13, 0, 12, 11, 10
+
+ ; store t10-11 and t20-21
+ mova m8, [tmpq+22*%%str]
+ mova m9, [tmpq+18*%%str]
+ %%STORE_2X2 8, 9, 2, 3, 12, 11, 10
+
+ ; store t12-13 and t18-19
+ mova m8, [tmpq+14*%%str]
+ mova m9, [tmpq+10*%%str]
+ mova m5, [tmpq+13*%%str]
+ mova m4, [tmpq+ 9*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10
+
+ ; store t14-17
+ mova m8, [tmpq+ 6*%%str]
+ mova m9, [tmpq+ 2*%%str]
+ mova m5, [tmpq+29*%%str]
+ mova m4, [tmpq+21*%%str]
+ %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0
+
+ SWAP 1, 10 ; zero
+%else
+ mova [tmpq+ 1*%%str], m1
+ mova [tmpq+11*%%str], m2
+ mova [tmpq+15*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+19*%%str], m5
+ pxor m1, m1
+
+ ; store t0-1 and t30-31
+ mova m2, [tmpq+ 0*%%str]
+ mova m3, [tmpq+ 4*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t2-3 and t28-29
+ mova m2, [tmpq+ 8*%%str]
+ mova m3, [tmpq+12*%%str]
+ mova m0, [tmpq+ 3*%%str]
+ mova m6, [tmpq+ 7*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t4-5 and t26-27
+ mova m2, [tmpq+16*%%str]
+ mova m3, [tmpq+20*%%str]
+ mova m0, [tmpq+ 1*%%str]
+ %%STORE_2X2 2, 3, 7, 0, 4, 5, 1
+
+ ; store t6-7 and t24-25
+ mova m2, [tmpq+24*%%str]
+ mova m3, [tmpq+28*%%str]
+ mova m0, [tmpq+17*%%str]
+ mova m6, [tmpq+19*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t8-9 and t22-23
+ mova m2, [tmpq+30*%%str]
+ mova m3, [tmpq+26*%%str]
+ mova m0, [tmpq+25*%%str]
+ mova m6, [tmpq+ 5*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t10-11 and t20-21
+ mova m2, [tmpq+22*%%str]
+ mova m3, [tmpq+18*%%str]
+ mova m0, [tmpq+11*%%str]
+ mova m6, [tmpq+15*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t12-13 and t18-19
+ mova m2, [tmpq+14*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m6, [tmpq+13*%%str]
+ mova m0, [tmpq+ 9*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+ ; store t14-17
+ mova m2, [tmpq+ 6*%%str]
+ mova m3, [tmpq+ 2*%%str]
+ mova m6, [tmpq+29*%%str]
+ mova m0, [tmpq+21*%%str]
+ %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+ movifnidn eobd, dword eobm
+%if cpuflag(ssse3)
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 34
+ jg .idct16x16
+ cmp eobd, 1
+ jg .idct8x8
+%else
+ cmp eobd, 1
+ jg .idctfull
+%endif
+
+ ; dc-only case
+ movifnidn blockq, blockmp
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ movsx coefd, word [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (32 << 14) + 8192
+ sar coefd, 14 + 6
+ movd m0, coefd
+%endif
+ SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_512]
+%endif
+ pxor m5, m5
+ movd [blockq], m5
+%rep 31
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ add dstq, strideq
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ RET
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+ DEFINE_ARGS block, u1, u2, u3, u4, tmp
+ mov blockq, r2mp
+%endif
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 8
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ sub stride30q, stride2q ; stride*30
+.loop2_8x8:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 8
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 8, m1
+ RET
+
+.idct16x16:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_16x16:
+ VP9_IDCT32_1D blockq, 1, 16
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_16x16
+
+%if ARCH_X86_64
+ sub blockq, 32
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
+ mov cntd, 4
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_full
+
+%if ARCH_X86_64
+ sub blockq, 64
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+ mova m%2, [dstq]
+ mova m%4, [dstq+strideq]
+ punpckhbw m%3, m%2, m%6
+ punpcklbw m%2, m%6
+ punpckhbw m%5, m%4, m%6
+ punpcklbw m%4, m%6
+ paddw m%3, m%1
+ paddw m%2, m%1
+ paddw m%5, m%1
+ paddw m%4, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq+strideq*0], m%2
+ mova [dstq+strideq*1], m%4
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 1
+ jg .idct16x16
+
+ ; dc-only case
+ mova m1, [pw_11585x2]
+ vpbroadcastw m0, [blockq]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pxor m5, m5
+ pmulhrsw m0, [pw_512]
+ movd [blockq], xm5
+
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 16
+.loop_dc:
+ VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+ dec cntd
+ jg .loop_dc
+ RET
+
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct16x16:
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 16
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 16, m1
+ RET
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 32
+ add tmpq, 1024
+ dec cntd
+ jg .loop1_full
+
+ sub blockq, 64
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 2
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dstq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 16
+ add tmpq, 32
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m1 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 32, m1
+ RET
+%endif
diff --git a/libs/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm b/libs/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm
new file mode 100644
index 000000000..902685edf
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 inverse transform x86 SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+cextern pw_8
+cextern pw_1023
+cextern pw_2048
+cextern pw_4095
+cextern pw_m1
+cextern pd_1
+cextern pd_16
+cextern pd_32
+cextern pd_8192
+
+pd_8: times 4 dd 8
+pd_3fff: times 4 dd 0x3fff
+
+cextern pw_11585x2
+
+cextern pw_5283_13377
+cextern pw_9929_13377
+cextern pw_15212_m13377
+cextern pw_15212_9929
+cextern pw_m5283_m15212
+cextern pw_13377x2
+cextern pw_m13377_13377
+cextern pw_13377_0
+
+pw_9929_m5283: times 4 dw 9929, -5283
+
+%macro COEF_PAIR 2-3
+cextern pw_m%1_%2
+cextern pw_%2_%1
+%if %0 == 3
+cextern pw_m%1_m%2
+%if %1 != %2
+cextern pw_m%2_%1
+cextern pw_%1_%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR 2404, 16207
+COEF_PAIR 3196, 16069, 1
+COEF_PAIR 4756, 15679
+COEF_PAIR 5520, 15426
+COEF_PAIR 6270, 15137, 1
+COEF_PAIR 8423, 14053
+COEF_PAIR 10394, 12665
+COEF_PAIR 11003, 12140
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 13160, 9760
+COEF_PAIR 13623, 9102, 1
+COEF_PAIR 14449, 7723
+COEF_PAIR 14811, 7005
+COEF_PAIR 15893, 3981
+COEF_PAIR 16305, 1606
+COEF_PAIR 16364, 804
+
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
+SECTION .text
+
+%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
+ mova m%3, [%7]
+ mova m%4, [%7+strideq]
+ paddw m%3, m%1
+ paddw m%4, m%2
+ pmaxsw m%3, m%5
+ pmaxsw m%4, m%5
+ pminsw m%3, m%6
+ pminsw m%4, m%6
+ mova [%7], m%3
+ mova [%7+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*4/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+; the input coefficients are scaled up by 2 bit (which we downscale immediately
+; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d.
+; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
+; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
+; add 2 bits, we need to scale before converting to word in 12bpp, since the
+; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
+; we can scale after converting to words (which is half the instructions),
+; since the input is only 14+sign bit, which fits in 15+sign words directly.
+
+%macro IWHT4_FN 2 ; bpp, max
+cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
+ mova m7, [pw_%2]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+%if %1 >= 12
+ mova m4, [blockq+0*16+8]
+ mova m5, [blockq+1*16+8]
+ psrad m0, 2
+ psrad m1, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m0, m4
+ packssdw m1, m5
+%else
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ psraw m0, 2
+ psraw m1, 2
+%endif
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+%if %1 >= 12
+ mova m4, [blockq+2*16+8]
+ mova m5, [blockq+3*16+8]
+ psrad m2, 2
+ psrad m3, 2
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m2, m4
+ packssdw m3, m5
+%else
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+ psraw m2, 2
+ psraw m3, 2
+%endif
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m6, m6
+ VP9_STORE_2X 0, 1, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 4, 5, 6, 7
+ ZERO_BLOCK blockq, 16, 4, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IWHT4_FN 10, 1023
+INIT_MMX mmxext
+IWHT4_FN 12, 4095
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ psraw m0, 4
+ psraw m1, 4
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ mova m5, [pw_1023]
+ VP9_STORE_2X 0, 1, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 2, 3, 6, 7, 4, 5
+%endmacro
+
+%macro DC_ONLY 2 ; shift, zero
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefd, 14 + %1
+%endmacro
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only
+ pxor m4, m4
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ movd [blockq], m4
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ DC_ONLY 4, m4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ mova m5, [pw_1023]
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ RET
+
+.idctfull:
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
+
+%macro IADST4_FN 4
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%endif
+ movdqa xmm5, [pd_8192]
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
+; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+ pand m%3, m%1, %8
+ pand m%4, m%2, %8
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+ SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
+ SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
+ SUMSUB_BA d, %4, %3, %7
+ SUMSUB_BA d, %6, %5, %7
+ SWAP %4, %6, %3
+%endmacro
+
+%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
+ movh m%1, [dstq+strideq*0]
+ movh m%2, [dstq+strideq*2]
+ movhps m%1, [dstq+strideq*1]
+ movhps m%2, [dstq+stride3q ]
+ paddw m%1, m%3
+ paddw m%2, m%4
+ pmaxsw m%1, %5
+ pmaxsw m%2, %5
+ pminsw m%1, %6
+ pminsw m%2, %6
+ movh [dstq+strideq*0], m%1
+ movhps [dstq+strideq*1], m%1
+ movh [dstq+strideq*2], m%2
+ movhps [dstq+stride3q ], m%2
+%endmacro
+
+%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
+ paddd m%1, %7
+ paddd m%2, %7
+ paddd m%3, %7
+ paddd m%4, %7
+ psrad m%1, %8
+ psrad m%2, %8
+ psrad m%3, %8
+ psrad m%4, %8
+ packssdw m%1, m%2
+ packssdw m%3, m%4
+ STORE_4x4 %2, %4, %1, %3, %5, %6
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
+ ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
+ ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
+ ; dword. After the final shift (4), the result is 13+sign bits, so we
+ ; don't need any additional processing to fit it in a word
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m4, m4
+ DC_ONLY 4, m4
+ movd m0, coefd
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova m5, [pw_4095]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ STORE_4x4 1, 3, 0, 0, m4, m5
+ RET
+
+.idctfull:
+ DEFINE_ARGS dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+ mova m6, [pd_8192]
+ mova m7, [pd_3fff]
+
+ IDCT4_12BPP_1D m6, m7
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ IDCT4_12BPP_1D m6, m7
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; out0 = 5283 * in0 + 13377 + in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
+; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15282 * in3 + rnd >> 14
+; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
+; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+ pand m4, m0, %2
+ pand m5, m1, %2
+ psrad m0, 14
+ psrad m1, 14
+ packssdw m5, m1
+ packssdw m4, m0
+ punpckhwd m1, m4, m5
+ punpcklwd m4, m5
+ pand m5, m2, %2
+ pand m6, m3, %2
+ psrad m2, 14
+ psrad m3, 14
+ packssdw m6, m3
+ packssdw m5, m2
+ punpckhwd m3, m5, m6
+ punpcklwd m5, m6
+ SCRATCH 1, 8, rsp+0*mmsize, a
+ SCRATCH 5, 9, rsp+1*mmsize, b
+
+ ; m1/3 have the high bits of 0,1,2,3
+ ; m4/5 have the low bits of 0,1,2,3
+ ; m0/2/6/7 are free
+
+ mova m2, [pw_15212_9929]
+ mova m0, [pw_5283_13377]
+ pmaddwd m7, m2, reg_b
+ pmaddwd m6, m4, m0
+ pmaddwd m2, m3
+ pmaddwd m0, reg_a
+ paddd m6, m7
+ paddd m0, m2
+ mova m1, [pw_m13377_13377]
+ mova m5, [pw_13377_0]
+ pmaddwd m7, m1, reg_b
+ pmaddwd m2, m4, m5
+ pmaddwd m1, m3
+ pmaddwd m5, reg_a
+ paddd m2, m7
+ paddd m1, m5
+ paddd m6, %1
+ paddd m2, %1
+ psrad m6, 14
+ psrad m2, 14
+ paddd m0, m6 ; t0
+ paddd m2, m1 ; t2
+
+ mova m7, [pw_m5283_m15212]
+ mova m5, [pw_9929_13377]
+ pmaddwd m1, m7, reg_b
+ pmaddwd m6, m4, m5
+ pmaddwd m7, m3
+ pmaddwd m5, reg_a
+ paddd m6, m1
+ paddd m7, m5
+ UNSCRATCH 5, 9, rsp+1*mmsize, b
+ pmaddwd m5, [pw_9929_m5283]
+ pmaddwd m4, [pw_15212_m13377]
+ pmaddwd m3, [pw_9929_m5283]
+ UNSCRATCH 1, 8, rsp+0*mmsize, a
+ pmaddwd m1, [pw_15212_m13377]
+ paddd m4, m5
+ paddd m3, m1
+ paddd m6, %1
+ paddd m4, %1
+ psrad m6, 14
+ psrad m4, 14
+ paddd m7, m6 ; t1
+ paddd m3, m4 ; t3
+
+ SWAP 1, 7
+%endmacro
+
+%macro IADST4_12BPP_FN 4
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+ mova m0, [blockq+0*16]
+ mova m1, [blockq+1*16]
+ mova m2, [blockq+2*16]
+ mova m3, [blockq+3*16]
+
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ %2_12BPP_1D reg_rnd, reg_mask
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+ %4_12BPP_1D reg_rnd, reg_mask
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+
+ ; writeout
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova m5, [pw_4095]
+ mova m6, [pd_8]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
+ RET
+%endmacro
+
+INIT_XMM sse2
+IADST4_12BPP_FN idct, IDCT4, iadst, IADST4
+IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
+IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+%3*mmsize
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+ mova m0, [%1+0*%4]
+ mova m2, [%1+2*%4]
+ mova m4, [%1+4*%4]
+ mova m6, [%1+6*%4]
+ IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+ SCRATCH 4, 8, rsp+(%5+0)*mmsize
+ SCRATCH 6, 9, rsp+(%5+1)*mmsize
+ mova m1, [%1+1*%4]
+ mova m3, [%1+3*%4]
+ mova m5, [%1+5*%4]
+ mova m7, [%1+7*%4]
+ SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
+ SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
+ SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
+ SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
+ SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
+ SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
+ SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
+ UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
+ SCRATCH 2, 8, rsp+(%5+0)*mmsize
+ SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
+ SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
+ SWAP 0, 5, 4, 6, 2, 7
+%endmacro
+
+%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
+ mova m%1, [%6+%7*0]
+ mova m%2, [%6+%7*1]
+ paddw m%1, m%3
+ paddw m%2, m%3
+ pmaxsw m%1, %4
+ pmaxsw m%2, %4
+ pminsw m%1, %5
+ pminsw m%2, %5
+ mova [%6+%7*0], m%1
+ mova [%6+%7*1], m%2
+%endmacro
+
+; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
+; storage also instead of allocating two more stack spaces. This doesn't
+; matter much but it's something...
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
+ ; fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ SCRATCH 0, 12, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 10, pd_8192, rnd
+ PRELOAD 11, pd_3fff, mask
+ PRELOAD 13, pd_16, srnd
+.loop_1:
+ IDCT8_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ IDCT8_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 12, rsp+16*mmsize, max
+ UNSCRATCH 2, 13, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
+%macro DC_ONLY_64BIT 2 ; shift, zero
+%if ARCH_X86_64
+ movsxd coefq, dword [blockq]
+ movd [blockq], %2
+ imul coefq, 11585
+ add coefq, 8192
+ sar coefq, 14
+ imul coefq, 11585
+ add coefq, ((1 << (%1 - 1)) << 14) + 8192
+ sar coefq, 14 + %1
+%else
+ mov coefd, dword [blockq]
+ movd [blockq], %2
+ DEFINE_ARGS dst, stride, cnt, coef, coefl
+ mov cntd, 2
+.loop_dc_calc:
+ mov coefld, coefd
+ sar coefd, 14
+ and coefld, 0x3fff
+ imul coefd, 11585
+ imul coefld, 11585
+ add coefld, 8192
+ sar coefld, 14
+ add coefd, coefld
+ dec cntd
+ jg .loop_dc_calc
+ add coefd, 1 << (%1 - 1)
+ sar coefd, %1
+%endif
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+ 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
+ ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 5, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 4
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
+; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+ pand m%3, m%1, %7
+ pand m%4, m%2, %7
+ psrad m%1, 14
+ psrad m%2, 14
+ packssdw m%4, m%2
+ packssdw m%3, m%1
+ punpckhwd m%2, m%4, m%3
+ punpcklwd m%4, m%3
+ pmaddwd m%3, m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_%6_%5]
+ pmaddwd m%4, [pw_m%5_%6]
+ pmaddwd m%2, [pw_m%5_%6]
+%endmacro
+
+; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
+; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%3, 14
+ psrad m%4, 14
+ paddd m%1, m%3
+ paddd m%2, m%4
+%endmacro
+
+%macro NEGD 1
+%if cpuflag(ssse3)
+ psignd %1, [pw_m1]
+%else
+ pxor %1, [pw_m1]
+ paddd %1, [pd_1]
+%endif
+%endmacro
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH 6, 8, rsp+17*mmsize
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
+ mova m0, [%1+ 0*mmsize]
+ mova m3, [%1+ 6*mmsize]
+ mova m4, [%1+ 8*mmsize]
+ mova m7, [%1+14*mmsize]
+ SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
+ SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
+ SCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
+
+ SCRATCH 3, 8, rsp+17*mmsize
+ SCRATCH 4, 9, rsp+18*mmsize
+ SCRATCH 7, 10, rsp+19*mmsize
+ SCRATCH 0, 11, rsp+20*mmsize
+
+ mova m1, [%1+ 2*mmsize]
+ mova m2, [%1+ 4*mmsize]
+ mova m5, [%1+10*mmsize]
+ mova m6, [%1+12*mmsize]
+ SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
+ SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
+
+ UNSCRATCH 7, 10, rsp+19*mmsize
+ UNSCRATCH 0, 11, rsp+20*mmsize
+ SCRATCH 1, 10, rsp+19*mmsize
+ SCRATCH 6, 11, rsp+20*mmsize
+
+ SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
+ SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
+ SCRATCH 2, 12, rsp+21*mmsize
+ SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
+ UNSCRATCH 2, 12, rsp+21*mmsize
+ NEGD m5 ; m5=out1
+ SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+ SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
+ NEGD m0 ; m0=out5
+
+ UNSCRATCH 3, 8, rsp+17*mmsize
+ UNSCRATCH 4, 9, rsp+18*mmsize
+ UNSCRATCH 1, 10, rsp+19*mmsize
+ UNSCRATCH 6, 11, rsp+20*mmsize
+ SCRATCH 2, 8, rsp+17*mmsize
+ SCRATCH 0, 9, rsp+18*mmsize
+
+ SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
+ SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
+ NEGD m6 ; m6=out7
+ SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
+ NEGD m3 ; m3=out3
+
+ UNSCRATCH 0, 9, rsp+18*mmsize
+
+ SWAP 0, 1, 5
+ SWAP 2, 7, 6
+%endmacro
+
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ SCRATCH 0, 13, rsp+16*mmsize, max
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%5_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%5_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
+ mov ptrq, rsp
+ PRELOAD 14, pd_8192, rnd
+ PRELOAD 15, pd_3fff, mask
+.loop_1:
+ %2_1D blockq, reg_rnd, reg_mask
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 6
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 2*mmsize], m1
+ mova [ptrq+ 4*mmsize], m2
+ mova [ptrq+ 6*mmsize], m3
+ UNSCRATCH 6, 8, rsp+17*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 1*mmsize], m4
+ mova [ptrq+ 3*mmsize], m5
+ mova [ptrq+ 5*mmsize], m6
+ mova [ptrq+ 7*mmsize], m7
+ add ptrq, 8 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+ mov ptrq, rsp
+.loop_2:
+ %4_1D ptrq, reg_rnd, reg_mask
+
+ pxor m6, m6
+ PRELOAD 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+ lea dstq, [dstq+strideq*4]
+ UNSCRATCH 0, 8, rsp+17*mmsize
+ UNSCRATCH 1, 13, rsp+16*mmsize, max
+ UNSCRATCH 2, 9, pd_16, srnd
+ ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
+ add ptrq, 16
+%if ARCH_X86_64
+ lea dstq, [dstbakq+8]
+%else
+ mov dstq, dstm
+ add dstq, 8
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m6 is still zero
+ ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+ RET
+
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST8_FN idct, IDCT8, iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct, IDCT8, col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
+
+%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
+ IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+ ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
+ SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
+ SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
+ SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
+ SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
+ mova [rsp+(%3+0)*mmsize], m5 ; t5
+ mova [rsp+(%3+1)*mmsize], m7 ; t7
+
+ mova m0, [%1+ 1*%2] ; in1
+ mova m3, [%1+ 7*%2] ; in7
+ mova m4, [%1+ 9*%2] ; in9
+ mova m7, [%1+15*%2] ; in15
+
+ SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
+ SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
+ SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
+ SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
+ SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a
+
+ mova m1, [%1+ 3*%2] ; in3
+ mova m2, [%1+ 5*%2] ; in5
+ mova m5, [%1+11*%2] ; in11
+ mova m6, [%1+13*%2] ; in13
+
+ SCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SCRATCH 7, 10, rsp+(%4+2)*mmsize
+
+ SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
+ SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
+ SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
+ SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
+ NEGD m1 ; m1=-t10
+ SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a
+
+ UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
+ SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
+ SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
+ SCRATCH 5, 10, rsp+(%4+2)*mmsize
+ SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
+ UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
+ SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
+ SCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a
+
+ ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+ ; free: 6,5
+
+ UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
+ SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
+ SCRATCH 5, 15, rsp+(%4+7)*mmsize
+ UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
+ SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
+ SCRATCH 5, 14, rsp+(%4+6)*mmsize
+ UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
+ SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
+ SCRATCH 5, 13, rsp+(%4+5)*mmsize
+ UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
+ SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
+ SCRATCH 5, 12, rsp+(%4+4)*mmsize
+ UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
+ SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
+ SCRATCH 4, 11, rsp+(%4+3)*mmsize
+ mova m4, [rsp+(%3+0)*mmsize]
+ SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
+ mova [rsp+(%3+0)*mmsize], m5
+ UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
+ UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
+ SCRATCH 2, 8, rsp+(%4+0)*mmsize
+ SCRATCH 1, 9, rsp+(%4+1)*mmsize
+ UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
+ SCRATCH 0, 10, rsp+(%4+2)*mmsize
+ mova m0, [rsp+(%3+1)*mmsize]
+ SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
+ SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
+
+ SWAP 0, 3, 1, 7, 2, 6, 4
+
+ ; output order: 8-11|r67-70=out0-3
+ ; 0-6,r65=out4-11
+ ; 12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+ ; fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT16_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 1, 9, rsp+68*mmsize
+ UNSCRATCH 2, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 7, 15, rsp+74*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ IDCT16_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+67*mmsize
+ UNSCRATCH 4, 9, rsp+68*mmsize
+ UNSCRATCH 5, 10, rsp+69*mmsize
+ UNSCRATCH 3, 11, rsp+70*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+71*mmsize
+ UNSCRATCH 5, 13, rsp+72*mmsize
+ UNSCRATCH 6, 14, rsp+73*mmsize
+ UNSCRATCH 0, 15, rsp+74*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+ ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop_dc
+ RET
+
+; r65-69 are available for spills
+; r70-77 are available on x86-32 only (x86-64 should use m8-15)
+; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
+%macro IADST16_1D 1 ; src
+ mova m0, [%1+ 0*4*mmsize] ; in0
+ mova m1, [%1+ 7*4*mmsize] ; in7
+ mova m2, [%1+ 8*4*mmsize] ; in8
+ mova m3, [%1+15*4*mmsize] ; in15
+ SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
+ SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
+ SCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
+ UNSCRATCH 0, 8, rsp+70*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
+ mova [rsp+67*mmsize], m1
+ SCRATCH 2, 9, rsp+71*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+ SCRATCH 0, 13, rsp+75*mmsize
+
+ mova m0, [%1+ 3*4*mmsize] ; in3
+ mova m1, [%1+ 4*4*mmsize] ; in4
+ mova m2, [%1+11*4*mmsize] ; in11
+ mova m3, [%1+12*4*mmsize] ; in12
+ SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
+ SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
+ SCRATCH 0, 15, rsp+77*mmsize
+ SCRATCH 3, 11, rsp+73*mmsize
+
+ UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
+ UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
+ SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
+ SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
+ UNSCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
+ mova [rsp+65*mmsize], m2
+ mova [rsp+66*mmsize], m1
+ SCRATCH 0, 8, rsp+70*mmsize
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ mova m0, [%1+ 2*4*mmsize] ; in2
+ mova m1, [%1+ 5*4*mmsize] ; in5
+ mova m2, [%1+10*4*mmsize] ; in10
+ mova m3, [%1+13*4*mmsize] ; in13
+ SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
+ SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
+ mova [rsp+68*mmsize], m1
+ mova [rsp+69*mmsize], m2
+ SCRATCH 3, 13, rsp+75*mmsize
+ SCRATCH 0, 14, rsp+76*mmsize
+
+ mova m0, [%1+ 1*4*mmsize] ; in1
+ mova m1, [%1+ 6*4*mmsize] ; in6
+ mova m2, [%1+ 9*4*mmsize] ; in9
+ mova m3, [%1+14*4*mmsize] ; in14
+ SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
+ SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
+ SCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
+ UNSCRATCH 1, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a
+
+ UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
+ UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
+ SCRATCH 0, 13, rsp+75*mmsize
+ SCRATCH 3, 14, rsp+76*mmsize
+ SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
+ SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
+ SCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
+ UNSCRATCH 0, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a
+
+ UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
+ UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
+ SCRATCH 2, 8, rsp+70*mmsize
+ SCRATCH 1, 12, rsp+74*mmsize
+ SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
+ SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
+ SCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
+ UNSCRATCH 2, 10, rsp+72*mmsize
+ SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a
+ NEGD m5 ; m5=out13
+
+ UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
+ mova m2, [rsp+68*mmsize] ; t2a
+ UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
+ SCRATCH 4, 10, rsp+72*mmsize
+ SCRATCH 5, 13, rsp+75*mmsize
+ UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
+ UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
+ SCRATCH 0, 14, rsp+76*mmsize
+ SCRATCH 3, 15, rsp+77*mmsize
+ mova m0, [rsp+67*mmsize] ; t0a
+ SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
+ SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
+ SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
+ SCRATCH 4, 9, rsp+71*mmsize
+ mova m3, [rsp+69*mmsize] ; t3a
+ SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7
+
+ mova [rsp+67*mmsize], m5
+ mova [rsp+68*mmsize], m6
+ mova [rsp+69*mmsize], m7
+ SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
+ SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
+ SCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
+ NEGD m2 ; m2=out3
+ UNSCRATCH 1, 11, rsp+73*mmsize
+ SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
+ SCRATCH 2, 11, rsp+73*mmsize
+ UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
+ SCRATCH 3, 12, rsp+74*mmsize
+
+ UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
+ mova m4, [rsp+65*mmsize] ; t8a
+ mova m5, [rsp+66*mmsize] ; t9a
+ SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
+ NEGD m3 ; m3=out1
+ SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
+ UNSCRATCH 6, 9, rsp+71*mmsize ; t0
+ UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
+ SCRATCH 3, 9, rsp+71*mmsize
+ SCRATCH 2, 14, rsp+76*mmsize
+
+ SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
+ mova [rsp+65*mmsize], m0
+ SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
+ UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
+ SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5
+
+ mova m2, [rsp+68*mmsize] ; t2
+ SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
+ SCRATCH 2, 8, rsp+70*mmsize
+ mova m2, [rsp+67*mmsize] ; t1
+ mova m3, [rsp+69*mmsize] ; t3
+ mova [rsp+67*mmsize], m7
+ SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
+ NEGD m3 ; m3=out15
+ SCRATCH 3, 15, rsp+77*mmsize
+ SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
+ mova m7, [rsp+67*mmsize]
+
+ SWAP 0, 1
+ SWAP 2, 5, 4, 6, 7, 3
+%endmacro
+
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+
+.body:
+ mova [rsp+64*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [%7_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%7_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ %2_1D blockq
+
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+ 9*mmsize], m2
+ mova [ptrq+13*mmsize], m3
+ mova m7, [rsp+65*mmsize]
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m4
+ mova [ptrq+ 6*mmsize], m5
+ mova [ptrq+10*mmsize], m6
+ mova [ptrq+14*mmsize], m7
+ UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
+ UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
+ UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 0*mmsize], m0
+ mova [ptrq+ 4*mmsize], m1
+ mova [ptrq+ 8*mmsize], m2
+ mova [ptrq+12*mmsize], m3
+ UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
+ UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 3*mmsize], m4
+ mova [ptrq+ 7*mmsize], m5
+ mova [ptrq+11*mmsize], m6
+ mova [ptrq+15*mmsize], m7
+ add ptrq, 16 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ mov ptrq, rsp
+.loop_2:
+ %5_1D ptrq
+
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ lea dstq, [dstq+strideq*4]
+ mova m0, [rsp+65*mmsize]
+ mova m1, [rsp+64*mmsize]
+ mova m2, [pd_32]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+ mov dstq, dstm
+%endif
+ UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
+ UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
+ UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
+ UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
+ ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea dstq, [dstbakq+stride3q*4]
+%else
+ lea dstq, [dstq+stride3q*4]
+%endif
+ UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
+ UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
+ UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
+ UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+ RET
+
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
+
+%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
+ IDCT16_1D %2, 2 * %3, 272, 257
+%if ARCH_X86_64
+ mova [rsp+257*mmsize], m8
+ mova [rsp+258*mmsize], m9
+ mova [rsp+259*mmsize], m10
+ mova [rsp+260*mmsize], m11
+ mova [rsp+261*mmsize], m12
+ mova [rsp+262*mmsize], m13
+ mova [rsp+263*mmsize], m14
+ mova [rsp+264*mmsize], m15
+%endif
+ mova [rsp+265*mmsize], m0
+ mova [rsp+266*mmsize], m1
+ mova [rsp+267*mmsize], m2
+ mova [rsp+268*mmsize], m3
+ mova [rsp+269*mmsize], m4
+ mova [rsp+270*mmsize], m5
+ mova [rsp+271*mmsize], m6
+
+ ; r257-260: t0-3
+ ; r265-272: t4/5a/6a/7/8/9a/10/11a
+ ; r261-264: t12a/13/14a/15
+ ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit
+
+ mova m0, [%2+ 1*%3] ; in1
+ mova m1, [%2+15*%3] ; in15
+ mova m2, [%2+17*%3] ; in17
+ mova m3, [%2+31*%3] ; in31
+ SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
+ SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
+ SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
+ SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 first quart
+
+ mova m0, [%2+ 7*%3] ; in7
+ mova m2, [%2+ 9*%3] ; in9
+ mova m4, [%2+23*%3] ; in23
+ mova m5, [%2+25*%3] ; in25
+ SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
+ SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
+ SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
+ SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a
+
+ ; end of stage 1-3 second quart
+
+ SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
+ SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t31
+ mova [rsp+273*mmsize], m4
+ mova [rsp+274*mmsize], m0
+ SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
+ SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
+ SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
+ SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
+ SCRATCH 3, 10, rsp+277*mmsize
+ SCRATCH 1, 11, rsp+278*mmsize
+ SCRATCH 7, 12, rsp+279*mmsize
+ SCRATCH 6, 13, rsp+280*mmsize
+ SCRATCH 5, 14, rsp+281*mmsize
+ SCRATCH 2, 15, rsp+282*mmsize
+
+ ; end of stage 4-5 first half
+
+ mova m0, [%2+ 5*%3] ; in5
+ mova m1, [%2+11*%3] ; in11
+ mova m2, [%2+21*%3] ; in21
+ mova m3, [%2+27*%3] ; in27
+ SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
+ SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
+ SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
+ SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
+ SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 2, 9, rsp+276*mmsize
+
+ ; end of stage 1-3 third quart
+
+ mova m0, [%2+ 3*%3] ; in3
+ mova m2, [%2+13*%3] ; in13
+ mova m4, [%2+19*%3] ; in19
+ mova m5, [%2+29*%3] ; in29
+ SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
+ SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
+ SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
+ SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
+ SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a
+
+ ; end of stage 1-3 fourth quart
+
+ SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
+ SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
+ UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
+ UNSCRATCH 7, 9, rsp+276*mmsize ; t27
+ SCRATCH 3, 8, rsp+275*mmsize
+ SCRATCH 1, 9, rsp+276*mmsize
+ SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
+ SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
+ SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
+ SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a
+
+ ; end of stage 4-5 second half
+
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t28
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
+ SCRATCH 4, 12, rsp+279*mmsize
+ SCRATCH 0, 13, rsp+280*mmsize
+ SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
+ SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
+ UNSCRATCH 0, 14, rsp+281*mmsize ; t30
+ UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
+ SCRATCH 2, 14, rsp+281*mmsize
+ SCRATCH 5, 15, rsp+282*mmsize
+ SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
+ SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24
+
+ mova m2, [rsp+273*mmsize] ; t16a
+ mova m5, [rsp+274*mmsize] ; t17
+ mova [rsp+273*mmsize], m6
+ mova [rsp+274*mmsize], m7
+ UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
+ UNSCRATCH 7, 11, rsp+278*mmsize ; t19
+ SCRATCH 4, 10, rsp+277*mmsize
+ SCRATCH 0, 11, rsp+278*mmsize
+ UNSCRATCH 4, 12, rsp+279*mmsize ; t20
+ UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
+ SCRATCH 3, 12, rsp+279*mmsize
+ SCRATCH 1, 13, rsp+280*mmsize
+ SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
+ SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
+ UNSCRATCH 3, 8, rsp+275*mmsize ; t22
+ UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
+ SCRATCH 0, 8, rsp+275*mmsize
+ SCRATCH 4, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
+ SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23
+
+ ; end of stage 6
+
+ UNSCRATCH 0, 10, rsp+277*mmsize ; t24
+ UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
+ SCRATCH 1, 10, rsp+277*mmsize
+ SCRATCH 3, 11, rsp+278*mmsize
+ SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
+ SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
+ UNSCRATCH 1, 12, rsp+279*mmsize ; t26
+ UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
+ SCRATCH 0, 12, rsp+279*mmsize
+ SCRATCH 4, 13, rsp+280*mmsize
+ SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
+ SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a
+
+ ; end of stage 7
+
+ mova m0, [rsp+269*mmsize] ; t8
+ mova m4, [rsp+270*mmsize] ; t9a
+ mova [rsp+269*mmsize], m1 ; t26a
+ mova [rsp+270*mmsize], m3 ; t27
+ mova m3, [rsp+271*mmsize] ; t10
+ SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
+ SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
+ SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
+ mova m1, [rsp+272*mmsize] ; t11a
+ mova [rsp+271*mmsize], m0
+ SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20
+
+%if %1 == 1
+ TRANSPOSE4x4D 2, 5, 6, 7, 0
+ mova [ptrq+ 2*mmsize], m2
+ mova [ptrq+10*mmsize], m5
+ mova [ptrq+18*mmsize], m6
+ mova [ptrq+26*mmsize], m7
+%else ; %1 == 2
+ pxor m0, m0
+ lea dstq, [dstq+strideq*8]
+ ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ mova m2, [rsp+271*mmsize]
+%if %1 == 1
+ TRANSPOSE4x4D 1, 3, 4, 2, 0
+ mova [ptrq+ 5*mmsize], m1
+ mova [ptrq+13*mmsize], m3
+ mova [ptrq+21*mmsize], m4
+ mova [ptrq+29*mmsize], m2
+%else ; %1 == 2
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out8-11 and out20-23
+
+ UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
+ UNSCRATCH 1, 8, rsp+275*mmsize ; t18
+ UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
+ UNSCRATCH 3, 10, rsp+277*mmsize ; t16
+ mova m7, [rsp+261*mmsize] ; t12a
+ mova m6, [rsp+262*mmsize] ; t13
+ mova m5, [rsp+263*mmsize] ; t14a
+ SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
+ SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
+ SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
+ mova m4, [rsp+264*mmsize] ; t15
+ SCRATCH 7, 8, rsp+275*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 3*mmsize], m0
+ mova [ptrq+11*mmsize], m1
+ mova [ptrq+19*mmsize], m2
+ mova [ptrq+27*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 9
+ lea dstq, [dstbakq+stride3q*4]
+%else ; x86-32
+ pxor m7, m7
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 8, rsp+275*mmsize ; out19
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 4*mmsize], m4
+ mova [ptrq+12*mmsize], m5
+ mova [ptrq+20*mmsize], m6
+ mova [ptrq+28*mmsize], m0
+%else ; %1 == 2
+ lea dstq, [dstq+strideq*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out12-19
+
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+ mova m7, [rsp+257*mmsize] ; t0
+ mova m6, [rsp+258*mmsize] ; t1
+ mova m5, [rsp+259*mmsize] ; t2
+ mova m4, [rsp+260*mmsize] ; t3
+ mova m0, [rsp+274*mmsize] ; t31
+ mova m1, [rsp+273*mmsize] ; t30a
+ UNSCRATCH 2, 15, rsp+282*mmsize ; t29
+ SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
+ SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
+ SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
+ SCRATCH 0, 9, rsp+276*mmsize
+ UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
+ SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28
+
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 7, 0
+ mova [ptrq+ 7*mmsize], m4
+ mova [ptrq+15*mmsize], m5
+ mova [ptrq+23*mmsize], m6
+ mova [ptrq+31*mmsize], m7
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 0, 8
+%else ; x86-32
+ pxor m0, m0
+%endif
+ lea dstq, [dstq+stride3q*4]
+ ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 7, 9, rsp+276*mmsize ; out0
+%if %1 == 1
+ TRANSPOSE4x4D 7, 1, 2, 3, 0
+ mova [ptrq+ 0*mmsize], m7
+ mova [ptrq+ 8*mmsize], m1
+ mova [ptrq+16*mmsize], m2
+ mova [ptrq+24*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else ; x86-32
+ mov dstq, dstm
+%endif
+ ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%endif
+%endif
+
+ ; end of last stage + store for out0-3 and out28-31
+
+%if ARCH_X86_64
+ SWAP 0, 8
+%endif
+ mova m7, [rsp+265*mmsize] ; t4
+ mova m6, [rsp+266*mmsize] ; t5a
+ mova m5, [rsp+267*mmsize] ; t6a
+ mova m4, [rsp+268*mmsize] ; t7
+ mova m0, [rsp+270*mmsize] ; t27
+ mova m1, [rsp+269*mmsize] ; t26a
+ UNSCRATCH 2, 13, rsp+280*mmsize ; t25
+ SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
+ SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
+ SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
+ UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
+ SCRATCH 7, 9, rsp+276*mmsize
+ SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24
+
+%if %1 == 1
+ TRANSPOSE4x4D 0, 1, 2, 3, 7
+ mova [ptrq+ 1*mmsize], m0
+ mova [ptrq+ 9*mmsize], m1
+ mova [ptrq+17*mmsize], m2
+ mova [ptrq+25*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+ SWAP 7, 8
+ lea dstq, [dstbakq+strideq*4]
+%else ; x86-32
+ pxor m7, m7
+ lea dstq, [dstq+strideq*4]
+%endif
+ ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+ UNSCRATCH 0, 9, rsp+276*mmsize ; out27
+%if %1 == 1
+ TRANSPOSE4x4D 4, 5, 6, 0, 7
+ mova [ptrq+ 6*mmsize], m4
+ mova [ptrq+14*mmsize], m5
+ mova [ptrq+22*mmsize], m6
+ mova [ptrq+30*mmsize], m0
+%else ; %1 == 2
+%if ARCH_X86_64
+ lea dstq, [dstbakq+stride3q*8]
+%else
+ mov dstq, dstm
+ lea dstq, [dstq+stride3q*8]
+%endif
+ ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+ ; end of last stage + store for out4-7 and out24-27
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_1023]
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+ ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+ ; fits in 32bit
+ DEFINE_ARGS dst, stride, block, coef
+ pxor m2, m2
+ DC_ONLY 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
+
+.idctfull:
+ mova [rsp+256*mmsize], m0
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+ mov dstbakq, dstq
+ movsxd cntq, cntd
+%endif
+%ifdef PIC
+ lea ptrq, [default_32x32]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_32x32+cntq-1]
+%endif
+ mov skipd, 8
+ sub skipd, cntd
+ mov ptrq, rsp
+.loop_1:
+ IDCT32_1D 1, blockq
+
+ add ptrq, 32 * mmsize
+ add blockq, mmsize
+ dec cntd
+ jg .loop_1
+
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ shl skipd, 2
+ lea blockq, [blockq+skipq*(mmsize/4)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+ mov ptrq, rsp
+.loop_2:
+ IDCT32_1D 2, ptrq
+
+ add ptrq, mmsize
+%if ARCH_X86_64
+ add dstbakq, 8
+ mov dstq, dstbakq
+%else
+ add dword dstm, 8
+ mov dstq, dstm
+%endif
+ dec cntd
+ jg .loop_2
+
+ ; m7 is still zero
+ ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
+ RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+ dst, stride, block, eob
+ mova m0, [pw_4095]
+ cmp eobd, 1
+ jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
+
+ ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+ ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+ DEFINE_ARGS dst, stride, block, coef, coefl
+ pxor m2, m2
+ DC_ONLY_64BIT 6, m2
+ movd m1, coefd
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 32
+.loop_dc:
+ STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
+ STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+ add dstq, strideq
+ dec cntd
+ jg .loop_dc
+ RET
diff --git a/libs/ffvpx/libavcodec/x86/vp9itxfm_template.asm b/libs/ffvpx/libavcodec/x86/vp9itxfm_template.asm
new file mode 100644
index 000000000..d2f2257d8
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9itxfm_template.asm
@@ -0,0 +1,142 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro VP9_IWHT4_1D 0
+ SWAP 1, 2, 3
+ paddw m0, m2
+ psubw m3, m1
+ psubw m4, m0, m3
+ psraw m4, 1
+ psubw m5, m4, m1
+ SWAP 5, 1
+ psubw m4, m2
+ SWAP 4, 2
+ psubw m0, m1
+ paddw m3, m2
+ SWAP 3, 2, 1
+%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+ pmaddwd m%1, m%2, %4
+ pmaddwd m%2, %5
+ paddd m%1, %3
+ paddd m%2, %3
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+ VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
+ VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+ punpckhwd m%6, m%2, m%1
+ punpcklwd m%2, m%1
+ VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
+%else
+ punpckhwd m%8, m%4, m%3
+ punpcklwd m%2, m%4, m%3
+ VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3, 2 ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+ pmulhrsw m2, m6 ; m2=t0
+ pmulhrsw m0, m6 ; m0=t1
+%else ; <= sse2
+ VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IADST4_1D 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+%if cpuflag(ssse3)
+ paddw m3, m0
+%endif
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+ pmaddwd xmm6, xmm0, [pw_13377_0]
+%endif
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+ pmaddwd xmm7, xmm2, [pw_m13377_13377]
+%endif
+ pmaddwd xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+ psubw m3, m2
+%else
+ paddd xmm6, xmm7
+%endif
+ paddd xmm0, xmm2
+ paddd xmm3, xmm5
+ paddd xmm2, xmm5
+%if notcpuflag(ssse3)
+ paddd xmm6, xmm5
+%endif
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_13377x2] ; out2
+%else
+ psrad xmm6, 14
+%endif
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+%if notcpuflag(ssse3)
+ packssdw xmm6, xmm6
+%endif
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+%if notcpuflag(ssse3)
+ movdq2q m3, xmm6 ; out2
+%endif
+ SWAP 0, 1, 2, 3
+%endmacro
diff --git a/libs/ffvpx/libavcodec/x86/vp9lpf.asm b/libs/ffvpx/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 000000000..4e7ede223
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,1211 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_3
+cextern pb_80
+
+pb_4: times 16 db 0x04
+pb_10: times 16 db 0x10
+pb_40: times 16 db 0x40
+pb_81: times 16 db 0x81
+pb_f8: times 16 db 0xf8
+pb_fe: times 16 db 0xfe
+pb_ff: times 16 db 0xff
+
+cextern pw_4
+cextern pw_8
+
+; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
+; the following mask is used to splat both in the same register
+mask_mix: times 8 db 0
+ times 8 db 1
+
+mask_mix84: times 8 db 0xff
+ times 8 db 0x00
+mask_mix48: times 8 db 0x00
+ times 8 db 0xff
+
+SECTION .text
+
+%macro SCRATCH 3
+%ifdef m8
+ SWAP %1, %2
+%else
+ mova [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%ifdef m8
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%endmacro
+
+; %1 = abs(%2-%3)
+%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
+%ifdef m8
+ psubusb %1, %3, %2
+ psubusb %4, %2, %3
+%else
+ mova %1, %3
+ mova %4, %2
+ psubusb %1, %2
+ psubusb %4, %3
+%endif
+ por %1, %4
+%endmacro
+
+; %1 = %1>%2
+%macro CMP_GT 2-3 ; src/dst, cmp, pb_80
+%if %0 == 3
+ pxor %1, %3
+%endif
+ pcmpgtb %1, %2
+%endmacro
+
+; %1 = abs(%2-%3) > %4
+%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80]
+ ABSSUB %1, %2, %3, %5 ; dst = abs(src1-src2)
+ CMP_GT %1, %4, %6 ; dst > cmp
+%endmacro
+
+%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
+ pand %1, %3 ; new &= mask
+ pandn %4, %3, %2 ; tmp = ~mask & old
+ por %1, %4 ; new&mask | old&~mask
+%endmacro
+
+%macro UNPACK 4
+%ifdef m8
+ punpck%1bw %2, %3, %4
+%else
+ mova %2, %3
+ punpck%1bw %2, %4
+%endif
+%endmacro
+
+%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
+ ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
+ psubw %3, [rsp+%4+%5*mmsize*2]
+ psubw %3, [rsp+%4+%6*mmsize*2]
+ paddw %3, [rsp+%4+%7*mmsize*2]
+%ifnidn %10, ""
+%if %11 == 0
+ punpck%2bw %1, %10, m0
+%else
+ UNPACK %2, %1, %10, m0
+%endif
+ mova [rsp+%4+%8*mmsize*2], %1
+ paddw %3, %1
+%else
+ paddw %3, [rsp+%4+%8*mmsize*2]
+%endif
+ psraw %1, %3, %9
+%endmacro
+
+; FIXME interleave l/h better (for instruction pairing)
+%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
+ FILTER%7_INIT %1, l, %3, %6 + 0
+ FILTER%7_INIT %2, h, %4, %6 + mmsize
+ packuswb %1, %2
+ MASK_APPLY %1, %9, %8, %2
+ mova %5, %1
+%endmacro
+
+
+%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
+ ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
+; FIXME interleave this properly with the subx2/addx2
+%ifnidn %15, ""
+%if %16 == 0 || ARCH_X86_64
+ mova %14, %15
+%endif
+%endif
+ FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
+ FILTER_SUBx2_ADDx2 %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
+ packuswb %1, %2
+%ifnidn %13, ""
+ MASK_APPLY %1, %13, %12, %2
+%else
+ MASK_APPLY %1, %5, %12, %2
+%endif
+ mova %5, %1
+%endmacro
+
+%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
+ mova %4, [pb_f8]
+ pand %1, %4
+ pand %2, %4
+ psrlq %1, 3
+ psrlq %2, 3
+ pxor %1, %3
+ pxor %2, %3
+ psubb %1, %3
+ psubb %2, %3
+%endmacro
+
+%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
+ pxor %3, %3
+ pxor %2, %2
+ pcmpgtb %3, %1 ; i8 < 0 mask
+ psubb %2, %1 ; neg values (only the originally - will be kept)
+ pand %2, %3 ; negative values of i8 (but stored as +)
+ pandn %3, %1 ; positive values of i8
+%endmacro
+
+; clip_u8(u8 + i8)
+%macro SIGN_ADD 4 ; dst, u8, i8, tmp1
+ EXTRACT_POS_NEG %3, %4, %1
+ paddusb %1, %2 ; add the positives
+ psubusb %1, %4 ; sub the negatives
+%endmacro
+
+; clip_u8(u8 - i8)
+%macro SIGN_SUB 4 ; dst, u8, i8, tmp1
+ EXTRACT_POS_NEG %3, %1, %4
+ paddusb %1, %2 ; add the negatives
+ psubusb %1, %4 ; sub the positives
+%endmacro
+
+%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+ UNPACK %2, %1, rp3, m0 ; p3: B->W
+ mova [rsp+%4+0*mmsize*2], %1
+ paddw %3, %1, %1 ; p3*2
+ paddw %3, %1 ; p3*3
+ punpck%2bw %1, m1, m0 ; p2: B->W
+ mova [rsp+%4+1*mmsize*2], %1
+ paddw %3, %1 ; p3*3 + p2
+ paddw %3, %1 ; p3*3 + p2*2
+ UNPACK %2, %1, rp1, m0 ; p1: B->W
+ mova [rsp+%4+2*mmsize*2], %1
+ paddw %3, %1 ; p3*3 + p2*2 + p1
+ UNPACK %2, %1, rp0, m0 ; p0: B->W
+ mova [rsp+%4+3*mmsize*2], %1
+ paddw %3, %1 ; p3*3 + p2*2 + p1 + p0
+ UNPACK %2, %1, rq0, m0 ; q0: B->W
+ mova [rsp+%4+4*mmsize*2], %1
+ paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0
+ paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
+ psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+%endmacro
+
+%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+ punpck%2bw %1, m2, m0 ; p7: B->W
+ mova [rsp+%4+ 8*mmsize*2], %1
+ psllw %3, %1, 3 ; p7*8
+ psubw %3, %1 ; p7*7
+ punpck%2bw %1, m3, m0 ; p6: B->W
+ mova [rsp+%4+ 9*mmsize*2], %1
+ paddw %3, %1 ; p7*7 + p6
+ paddw %3, %1 ; p7*7 + p6*2
+ UNPACK %2, %1, rp5, m0 ; p5: B->W
+ mova [rsp+%4+10*mmsize*2], %1
+ paddw %3, %1 ; p7*7 + p6*2 + p5
+ UNPACK %2, %1, rp4, m0 ; p4: B->W
+ mova [rsp+%4+11*mmsize*2], %1
+ paddw %3, %1 ; p7*7 + p6*2 + p5 + p4
+ paddw %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3
+ paddw %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2
+ paddw %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1
+ paddw %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0
+ paddw %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0
+ paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
+ psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+%endmacro
+
+%macro TRANSPOSE16x16B 17
+ mova %17, m%16
+ SBUTTERFLY bw, %1, %2, %16
+ SBUTTERFLY bw, %3, %4, %16
+ SBUTTERFLY bw, %5, %6, %16
+ SBUTTERFLY bw, %7, %8, %16
+ SBUTTERFLY bw, %9, %10, %16
+ SBUTTERFLY bw, %11, %12, %16
+ SBUTTERFLY bw, %13, %14, %16
+ mova m%16, %17
+ mova %17, m%14
+ SBUTTERFLY bw, %15, %16, %14
+ SBUTTERFLY wd, %1, %3, %14
+ SBUTTERFLY wd, %2, %4, %14
+ SBUTTERFLY wd, %5, %7, %14
+ SBUTTERFLY wd, %6, %8, %14
+ SBUTTERFLY wd, %9, %11, %14
+ SBUTTERFLY wd, %10, %12, %14
+ SBUTTERFLY wd, %13, %15, %14
+ mova m%14, %17
+ mova %17, m%12
+ SBUTTERFLY wd, %14, %16, %12
+ SBUTTERFLY dq, %1, %5, %12
+ SBUTTERFLY dq, %2, %6, %12
+ SBUTTERFLY dq, %3, %7, %12
+ SBUTTERFLY dq, %4, %8, %12
+ SBUTTERFLY dq, %9, %13, %12
+ SBUTTERFLY dq, %10, %14, %12
+ SBUTTERFLY dq, %11, %15, %12
+ mova m%12, %17
+ mova %17, m%8
+ SBUTTERFLY dq, %12, %16, %8
+ SBUTTERFLY qdq, %1, %9, %8
+ SBUTTERFLY qdq, %2, %10, %8
+ SBUTTERFLY qdq, %3, %11, %8
+ SBUTTERFLY qdq, %4, %12, %8
+ SBUTTERFLY qdq, %5, %13, %8
+ SBUTTERFLY qdq, %6, %14, %8
+ SBUTTERFLY qdq, %7, %15, %8
+ mova m%8, %17
+ mova %17, m%1
+ SBUTTERFLY qdq, %8, %16, %1
+ mova m%1, %17
+ SWAP %2, %9
+ SWAP %3, %5
+ SWAP %4, %13
+ SWAP %6, %11
+ SWAP %8, %15
+ SWAP %12, %14
+%endmacro
+
+%macro TRANSPOSE8x8B 13
+ SBUTTERFLY bw, %1, %2, %7
+ movdq%10 m%7, %9
+ movdqa %11, m%2
+ SBUTTERFLY bw, %3, %4, %2
+ SBUTTERFLY bw, %5, %6, %2
+ SBUTTERFLY bw, %7, %8, %2
+ SBUTTERFLY wd, %1, %3, %2
+ movdqa m%2, %11
+ movdqa %11, m%3
+ SBUTTERFLY wd, %2, %4, %3
+ SBUTTERFLY wd, %5, %7, %3
+ SBUTTERFLY wd, %6, %8, %3
+ SBUTTERFLY dq, %1, %5, %3
+ SBUTTERFLY dq, %2, %6, %3
+ movdqa m%3, %11
+ movh %12, m%2
+ movhps %13, m%2
+ SBUTTERFLY dq, %3, %7, %2
+ SBUTTERFLY dq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
+%endmacro
+
+%macro DEFINE_REAL_P7_TO_Q7 0-1 0
+%define P7 dstq + 4*mstrideq + %1
+%define P6 dstq + mstride3q + %1
+%define P5 dstq + 2*mstrideq + %1
+%define P4 dstq + mstrideq + %1
+%define P3 dstq + %1
+%define P2 dstq + strideq + %1
+%define P1 dstq + 2* strideq + %1
+%define P0 dstq + stride3q + %1
+%define Q0 dstq + 4* strideq + %1
+%define Q1 dst2q + mstride3q + %1
+%define Q2 dst2q + 2*mstrideq + %1
+%define Q3 dst2q + mstrideq + %1
+%define Q4 dst2q + %1
+%define Q5 dst2q + strideq + %1
+%define Q6 dst2q + 2* strideq + %1
+%define Q7 dst2q + stride3q + %1
+%endmacro
+
+%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
+%define P3 rsp + 0*mmsize + %1
+%define P2 rsp + 1*mmsize + %1
+%define P1 rsp + 2*mmsize + %1
+%define P0 rsp + 3*mmsize + %1
+%define Q0 rsp + 4*mmsize + %1
+%define Q1 rsp + 5*mmsize + %1
+%define Q2 rsp + 6*mmsize + %1
+%define Q3 rsp + 7*mmsize + %1
+%if mmsize == 16
+%define P7 rsp + 8*mmsize + %1
+%define P6 rsp + 9*mmsize + %1
+%define P5 rsp + 10*mmsize + %1
+%define P4 rsp + 11*mmsize + %1
+%define Q4 rsp + 12*mmsize + %1
+%define Q5 rsp + 13*mmsize + %1
+%define Q6 rsp + 14*mmsize + %1
+%define Q7 rsp + 15*mmsize + %1
+%endif
+%endmacro
+
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix]
+%if cpuflag(ssse3)
+ pshufb %1, %2
+%else
+ punpcklbw %1, %1
+ punpcklwd %1, %1
+ punpckldq %1, %1
+%endif
+%endmacro
+
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
+%assign %%ext 0
+%if ARCH_X86_32 || mmsize == 8
+%assign %%ext %5
+%endif
+
+%if UNIX64
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+%else
+%if WIN64
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
+%else
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
+%define Ed dword r2m
+%define Id dword r3m
+%endif
+%define Hd dword r4m
+%endif
+
+ mov mstrideq, strideq
+ neg mstrideq
+
+ lea stride3q, [strideq*3]
+ lea mstride3q, [mstrideq*3]
+
+%ifidn %1, h
+%if %2 != 16
+%if mmsize == 16
+%define movx movh
+%else
+%define movx mova
+%endif
+ lea dstq, [dstq + 4*strideq - 4]
+%else
+%define movx movu
+ lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
+%endif
+%else
+ lea dstq, [dstq + 4*mstrideq]
+%endif
+ ; FIXME we shouldn't need two dts registers if mmsize == 8
+ lea dst2q, [dstq + 8*strideq]
+
+ DEFINE_REAL_P7_TO_Q7
+
+%ifidn %1, h
+ movx m0, [P7]
+ movx m1, [P6]
+ movx m2, [P5]
+ movx m3, [P4]
+ movx m4, [P3]
+ movx m5, [P2]
+%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
+ movx m6, [P1]
+%endif
+ movx m7, [P0]
+%ifdef m8
+ movx m8, [Q0]
+ movx m9, [Q1]
+ movx m10, [Q2]
+ movx m11, [Q3]
+ movx m12, [Q4]
+ movx m13, [Q5]
+ movx m14, [Q6]
+ movx m15, [Q7]
+ DEFINE_TRANSPOSED_P7_TO_Q7
+%if %2 == 16
+ TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+ mova [P7], m0
+ mova [P6], m1
+ mova [P5], m2
+ mova [P4], m3
+%else ; %2 == 44/48/84/88
+ ; 8x16 transpose
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+ punpcklbw m8, m9
+ punpcklbw m10, m11
+ punpcklbw m12, m13
+ punpcklbw m14, m15
+ TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15
+ SWAP 0, 4
+ SWAP 2, 5
+ SWAP 0, 6
+ SWAP 0, 7
+ SWAP 10, 9
+ SWAP 12, 10
+ SWAP 14, 11
+%endif ; %2
+ mova [P3], m4
+ mova [P2], m5
+ mova [P1], m6
+ mova [P0], m7
+ mova [Q0], m8
+ mova [Q1], m9
+ mova [Q2], m10
+ mova [Q3], m11
+%if %2 == 16
+ mova [Q4], m12
+ mova [Q5], m13
+ mova [Q6], m14
+ mova [Q7], m15
+%endif ; %2
+%else ; x86-32
+%if %2 == 16
+ TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ movh [P7], m0
+ movh [P5], m1
+ movh [P3], m2
+ movh [P1], m3
+ movh [Q2], m5
+ movh [Q4], m6
+ movh [Q6], m7
+ movhps [P6], m0
+ movhps [P4], m1
+ movhps [P2], m2
+ movhps [P0], m3
+ movhps [Q3], m5
+ movhps [Q5], m6
+ movhps [Q7], m7
+ DEFINE_REAL_P7_TO_Q7
+ movx m0, [Q0]
+ movx m1, [Q1]
+ movx m2, [Q2]
+ movx m3, [Q3]
+ movx m4, [Q4]
+ movx m5, [Q5]
+ movx m7, [Q7]
+ TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
+ DEFINE_TRANSPOSED_P7_TO_Q7 8
+ movh [P7], m0
+ movh [P5], m1
+ movh [P3], m2
+ movh [P1], m3
+ movh [Q2], m5
+ movh [Q4], m6
+ movh [Q6], m7
+ movhps [P6], m0
+ movhps [P4], m1
+ movhps [P2], m2
+ movhps [P0], m3
+ movhps [Q3], m5
+ movhps [Q5], m6
+ movhps [Q7], m7
+ DEFINE_TRANSPOSED_P7_TO_Q7
+%elif %2 > 16 ; %2 == 44/48/84/88
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+ movx m1, [Q0]
+ movx m3, [Q1]
+ movx m5, [Q2]
+ movx m7, [Q3]
+ punpcklbw m1, m3
+ punpcklbw m5, m7
+ movx m3, [Q4]
+ movx m7, [Q5]
+ punpcklbw m3, m7
+ mova [rsp], m3
+ movx m3, [Q6]
+ movx m7, [Q7]
+ punpcklbw m3, m7
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
+ mova [P3], m0
+ mova [P2], m2
+ mova [P1], m4
+ mova [P0], m6
+ mova [Q1], m5
+ mova [Q2], m7
+ mova [Q3], m3
+%else ; %2 == 4 || %2 == 8
+ SBUTTERFLY bw, 0, 1, 6
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 4, 5, 6
+ mova [rsp+4*mmsize], m5
+ mova m6, [P1]
+ SBUTTERFLY bw, 6, 7, 5
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ TRANSPOSE4x4W 0, 2, 4, 6, 5
+ mova [P3], m0
+ mova [P2], m2
+ mova [P1], m4
+ mova [P0], m6
+ mova m5, [rsp+4*mmsize]
+ TRANSPOSE4x4W 1, 3, 5, 7, 0
+ mova [Q0], m1
+ mova [Q1], m3
+ mova [Q2], m5
+ mova [Q3], m7
+%endif ; %2
+%endif ; x86-32/64
+%endif ; %1 == h
+
+ ; calc fm mask
+%if %2 == 16 || mmsize == 8
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m2, I, m0 ; I I I I ...
+ SPLATB_REG m3, E, m0 ; E E E E ...
+%else
+%if cpuflag(ssse3)
+ mova m0, [mask_mix]
+%endif
+ movd m2, Id
+ movd m3, Ed
+ SPLATB_MIX m2, m0
+ SPLATB_MIX m3, m0
+%endif
+ mova m0, [pb_80]
+ pxor m2, m0
+ pxor m3, m0
+%ifdef m8
+%ifidn %1, v
+ mova m8, [P3]
+ mova m9, [P2]
+ mova m10, [P1]
+ mova m11, [P0]
+ mova m12, [Q0]
+ mova m13, [Q1]
+ mova m14, [Q2]
+ mova m15, [Q3]
+%else
+ ; In case of horizontal, P3..Q3 are already present in some registers due
+ ; to the previous transpose, so we just swap registers.
+ SWAP 8, 4, 12
+ SWAP 9, 5, 13
+ SWAP 10, 6, 14
+ SWAP 11, 7, 15
+%endif
+%define rp3 m8
+%define rp2 m9
+%define rp1 m10
+%define rp0 m11
+%define rq0 m12
+%define rq1 m13
+%define rq2 m14
+%define rq3 m15
+%else
+%define rp3 [P3]
+%define rp2 [P2]
+%define rp1 [P1]
+%define rp0 [P0]
+%define rq0 [Q0]
+%define rq1 [Q1]
+%define rq2 [Q2]
+%define rq3 [Q3]
+%endif
+ ABSSUB_GT m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I
+ ABSSUB_GT m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I
+ por m5, m1
+ ABSSUB_GT m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I
+ por m5, m1
+ ABSSUB_GT m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I
+ por m5, m1
+ ABSSUB_GT m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I
+ por m5, m1
+ ABSSUB_GT m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I
+ por m5, m1
+ ABSSUB m1, rp0, rq0, m7 ; abs(p0-q0)
+ paddusb m1, m1 ; abs(p0-q0) * 2
+ ABSSUB m2, rp1, rq1, m7 ; abs(p1-q1)
+ pand m2, [pb_fe] ; drop lsb so shift can work
+ psrlq m2, 1 ; abs(p1-q1)/2
+ paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2
+ pxor m1, m0
+ pcmpgtb m1, m3
+ por m1, m5 ; fm final value
+ SWAP 1, 3
+ pxor m3, [pb_ff]
+
+ ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
+ ; calc flat8in (if not 44_16) and hev masks
+%if %2 != 44 && %2 != 4
+ mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80
+ ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1
+%ifdef m8
+ mova m8, [pb_80]
+%define rb80 m8
+%else
+%define rb80 [pb_80]
+%endif
+ ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1
+ por m2, m1
+ ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0)
+%if %2 <= 16
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, H, m0 ; H H H H ...
+%else
+ movd m7, Hd
+ SPLATB_MIX m7
+%endif
+ pxor m7, rb80
+ pxor m4, rb80
+ pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
+ CMP_GT m4, m6 ; abs(p1 - p0) <= 1
+ por m2, m4 ; (flat8in)
+ ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0)
+ pxor m4, rb80
+ pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
+ por m0, m5 ; hev final value
+ CMP_GT m4, m6 ; abs(q1 - q0) <= 1
+ por m2, m4 ; (flat8in)
+ ABSSUB_GT m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1
+ por m2, m1
+ ABSSUB_GT m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1
+ por m2, m1 ; flat8in final value
+ pxor m2, [pb_ff]
+%if %2 == 84 || %2 == 48
+ pand m2, [mask_mix%2]
+%endif
+%else
+ mova m6, [pb_80]
+%if %2 == 44
+ movd m7, Hd
+ SPLATB_MIX m7
+%else
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, H, m0 ; H H H H ...
+%endif
+ pxor m7, m6
+ ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0)
+ pxor m4, m6
+ pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
+ ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0)
+ pxor m4, m6
+ pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
+ por m0, m5 ; hev final value
+%endif
+
+%if %2 == 16
+ ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
+ ; calc flat8out mask
+%ifdef m8
+ mova m8, [P7]
+ mova m9, [P6]
+%define rp7 m8
+%define rp6 m9
+%else
+%define rp7 [P7]
+%define rp6 [P6]
+%endif
+ ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1
+ ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1
+ por m1, m7
+%ifdef m8
+ mova m8, [P5]
+ mova m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%endif
+ ABSSUB_GT m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1
+ por m1, m7
+ ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1
+ por m1, m7
+%ifdef m8
+ mova m14, [Q4]
+ mova m15, [Q5]
+%define rq4 m14
+%define rq5 m15
+%else
+%define rq4 [Q4]
+%define rq5 [Q5]
+%endif
+ ABSSUB_GT m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1
+ por m1, m7
+ ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1
+ por m1, m7
+%ifdef m8
+ mova m14, [Q6]
+ mova m15, [Q7]
+%define rq6 m14
+%define rq7 m15
+%else
+%define rq6 [Q6]
+%define rq7 [Q7]
+%endif
+ ABSSUB_GT m7, rq6, rq0, m6, m5 ; abs(q4 - q0) <= 1
+ por m1, m7
+ ABSSUB_GT m7, rq7, rq0, m6, m5 ; abs(q5 - q0) <= 1
+ por m1, m7 ; flat8out final value
+ pxor m1, [pb_ff]
+%endif
+
+ ; if (fm) {
+ ; if (out && in) filter_14()
+ ; else if (in) filter_6()
+ ; else if (hev) filter_2()
+ ; else filter_4()
+ ; }
+ ;
+ ; f14: fm & out & in
+ ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in
+ ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
+ ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev
+
+ ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
+ ; filter2()
+%if %2 != 44 && %2 != 4
+ mova m6, [pb_80] ; already in m6 if 44_16
+ SCRATCH 2, 15, rsp+%3+%4
+%if %2 == 16
+ SCRATCH 1, 8, rsp+%3+%4+16
+%endif
+%endif
+ pxor m2, m6, rq0 ; q0 ^ 0x80
+ pxor m4, m6, rp0 ; p0 ^ 0x80
+ psubsb m2, m4 ; (signed) q0 - p0
+ pxor m4, m6, rp1 ; p1 ^ 0x80
+ pxor m5, m6, rq1 ; q1 ^ 0x80
+ psubsb m4, m5 ; (signed) p1 - q1
+ paddsb m4, m2 ; (q0 - p0) + (p1 - q1)
+ paddsb m4, m2 ; 2*(q0 - p0) + (p1 - q1)
+ paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1)
+ paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127)
+ paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127)
+%ifdef m8
+ mova m14, [pb_10] ; will be reused in filter4()
+%define rb10 m14
+%else
+%define rb10 [pb_10]
+%endif
+ SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3
+ SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1
+ SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2
+%if %2 != 44 && %2 != 4
+%ifdef m8
+ pandn m6, m15, m3 ; ~mask(in) & mask(fm)
+%else
+ mova m6, [rsp+%3+%4]
+ pandn m6, m3
+%endif
+ pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev)
+%else
+ pand m6, m3, m0
+%endif
+ MASK_APPLY m7, rq0, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4()
+ MASK_APPLY m1, rp0, m6, m5 ; m1 = filter2(p0) & mask / we write it in filter4()
+
+ ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
+ ; filter4()
+ mova m4, m2
+ paddsb m2, m4 ; 2 * (q0 - p0)
+ paddsb m2, m4 ; 3 * (q0 - p0)
+ paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
+ paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
+ SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3
+%if %2 != 44 && %2 != 4
+%ifdef m8
+ pandn m5, m15, m3 ; ~mask(in) & mask(fm)
+%else
+ mova m5, [rsp+%3+%4]
+ pandn m5, m3
+%endif
+ pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm))
+%else
+ pandn m0, m3
+%endif
+ SIGN_SUB m5, rq0, m6, m4 ; q0 - f1
+ MASK_APPLY m5, m7, m0, m4 ; filter4(q0) & mask
+ mova [Q0], m5
+ SIGN_ADD m7, rp0, m2, m4 ; p0 + f2
+ MASK_APPLY m7, m1, m0, m4 ; filter4(p0) & mask
+ mova [P0], m7
+ paddb m6, [pb_80] ;
+ pxor m1, m1 ; f=(f1+1)>>1
+ pavgb m6, m1 ;
+ psubb m6, [pb_40] ;
+ SIGN_ADD m1, rp1, m6, m2 ; p1 + f
+ SIGN_SUB m4, rq1, m6, m2 ; q1 - f
+ MASK_APPLY m1, rp1, m0, m2 ; m1 = filter4(p1)
+ MASK_APPLY m4, rq1, m0, m2 ; m4 = filter4(q1)
+ mova [P1], m1
+ mova [Q1], m4
+
+%if %2 != 44 && %2 != 4
+ UNSCRATCH 2, 15, rsp+%3+%4
+%endif
+
+ ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
+ ; filter6()
+%if %2 != 44 && %2 != 4
+ pxor m0, m0
+%if %2 != 16
+ pand m3, m2
+%else
+ pand m2, m3 ; mask(fm) & mask(in)
+%ifdef m8
+ pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in))
+%else
+ mova m3, [rsp+%3+%4+16]
+ pandn m3, m2
+%endif
+%endif
+%ifdef m8
+ mova m14, [P3]
+ mova m9, [Q3]
+%define rp3 m14
+%define rq3 m9
+%else
+%define rp3 [P3]
+%define rq3 [Q3]
+%endif
+ mova m1, [P2]
+ FILTER_INIT m4, m5, m6, m7, [P2], %4, 6, m3, m1 ; [p2]
+ mova m1, [Q2]
+ FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
+ FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1 ; [p0] -p3 -p1 +p0 +q2
+ FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
+ FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, "" ; [q1] -p2 -q0 +q1 +q3
+ FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1 ; [q2] -p1 -q1 +q2 +q3
+%endif
+
+%if %2 == 16
+ UNSCRATCH 1, 8, rsp+%3+%4+16
+%endif
+
+ ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
+ ; filter14()
+ ;
+ ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13
+ ;
+ ; q2 q3 p3 p2 p1 p0 q0 q1
+ ; p6 -7 p7 p6 p5 p4 . . . . .
+ ; p5 -6 -p7 -p6 +p5 +q1 . . . .
+ ; p4 -5 -p7 -p5 +p4 +q2 . . . q2
+ ; p3 -4 -p7 -p4 +p3 +q3 . . . q3
+ ; p2 -3 -p7 -p3 +p2 +q4 . . . q4
+ ; p1 -2 -p7 -p2 +p1 +q5 . . . q5
+ ; p0 -1 -p7 -p1 +p0 +q6 . . . q6
+ ; q0 +0 -p7 -p0 +q0 +q7 . . . q7
+ ; q1 +1 -p6 -q0 +q1 +q7 q1 . . .
+ ; q2 +2 -p5 -q1 +q2 +q7 . q2 . .
+ ; q3 +3 -p4 -q2 +q3 +q7 . q3 . .
+ ; q4 +4 -p3 -q3 +q4 +q7 . q4 . .
+ ; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
+ ; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
+
+%if %2 == 16
+ pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
+ mova m2, [P7]
+ mova m3, [P6]
+%ifdef m8
+ mova m8, [P5]
+ mova m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%define rp5s m8
+%define rp4s m9
+%define rp3s m14
+%define rq4 m8
+%define rq5 m9
+%define rq6 m14
+%define rq7 m15
+%define rq4s m8
+%define rq5s m9
+%define rq6s m14
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%define rp5s ""
+%define rp4s ""
+%define rp3s ""
+%define rq4 [Q4]
+%define rq5 [Q5]
+%define rq6 [Q6]
+%define rq7 [Q7]
+%define rq4s ""
+%define rq5s ""
+%define rq6s ""
+%endif
+ FILTER_INIT m4, m5, m6, m7, [P6], %4, 14, m1, m3 ; [p6]
+ FILTER_UPDATE m4, m5, m6, m7, [P5], %4, 8, 9, 10, 5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1
+ FILTER_UPDATE m4, m5, m6, m7, [P4], %4, 8, 10, 11, 6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2
+ FILTER_UPDATE m4, m5, m6, m7, [P3], %4, 8, 11, 0, 7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3
+ FILTER_UPDATE m4, m5, m6, m7, [P2], %4, 8, 0, 1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
+ FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 8, 1, 2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
+ FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 8, 2, 3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
+ FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 8, 3, 4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 9, 4, 5, 15, 4, m1, "" ; [q1] -p6 -q0 +q1 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 10, 5, 6, 15, 4, m1, "" ; [q2] -p5 -q1 +q2 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q3], %4, 11, 6, 7, 15, 4, m1, "" ; [q3] -p4 -q2 +q3 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q4], %4, 0, 7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q5], %4, 1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q6], %4, 2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7
+%endif
+
+%ifidn %1, h
+%if %2 == 16
+ mova m0, [P7]
+ mova m1, [P6]
+ mova m2, [P5]
+ mova m3, [P4]
+ mova m4, [P3]
+ mova m5, [P2]
+%if ARCH_X86_64
+ mova m6, [P1]
+%endif
+ mova m7, [P0]
+%if ARCH_X86_64
+ mova m8, [Q0]
+ mova m9, [Q1]
+ mova m10, [Q2]
+ mova m11, [Q3]
+ mova m12, [Q4]
+ mova m13, [Q5]
+ mova m14, [Q6]
+ mova m15, [Q7]
+ TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+ DEFINE_REAL_P7_TO_Q7
+ movu [P7], m0
+ movu [P6], m1
+ movu [P5], m2
+ movu [P4], m3
+ movu [P3], m4
+ movu [P2], m5
+ movu [P1], m6
+ movu [P0], m7
+ movu [Q0], m8
+ movu [Q1], m9
+ movu [Q2], m10
+ movu [Q3], m11
+ movu [Q4], m12
+ movu [Q5], m13
+ movu [Q6], m14
+ movu [Q7], m15
+%else
+ DEFINE_REAL_P7_TO_Q7
+ TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
+ movh [P7], m0
+ movh [P5], m1
+ movh [P3], m2
+ movh [P1], m3
+ movh [Q2], m5
+ movh [Q4], m6
+ movh [Q6], m7
+ movhps [P6], m0
+ movhps [P4], m1
+ movhps [P2], m2
+ movhps [P0], m3
+ movhps [Q3], m5
+ movhps [Q5], m6
+ movhps [Q7], m7
+ DEFINE_TRANSPOSED_P7_TO_Q7
+ mova m0, [Q0]
+ mova m1, [Q1]
+ mova m2, [Q2]
+ mova m3, [Q3]
+ mova m4, [Q4]
+ mova m5, [Q5]
+ mova m7, [Q7]
+ DEFINE_REAL_P7_TO_Q7 8
+ TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
+ movh [P7], m0
+ movh [P5], m1
+ movh [P3], m2
+ movh [P1], m3
+ movh [Q2], m5
+ movh [Q4], m6
+ movh [Q6], m7
+ movhps [P6], m0
+ movhps [P4], m1
+ movhps [P2], m2
+ movhps [P0], m3
+ movhps [Q3], m5
+ movhps [Q5], m6
+ movhps [Q7], m7
+%endif
+%elif %2 == 44 || %2 == 4
+ SWAP 0, 1 ; m0 = p1
+ SWAP 1, 7 ; m1 = p0
+ SWAP 2, 5 ; m2 = q0
+ SWAP 3, 4 ; m3 = q1
+ DEFINE_REAL_P7_TO_Q7 2
+ SBUTTERFLY bw, 0, 1, 4
+ SBUTTERFLY bw, 2, 3, 4
+ SBUTTERFLY wd, 0, 2, 4
+ SBUTTERFLY wd, 1, 3, 4
+%if mmsize == 16
+ movd [P7], m0
+ movd [P3], m2
+ movd [Q0], m1
+ movd [Q4], m3
+ psrldq m0, 4
+ psrldq m1, 4
+ psrldq m2, 4
+ psrldq m3, 4
+ movd [P6], m0
+ movd [P2], m2
+ movd [Q1], m1
+ movd [Q5], m3
+ psrldq m0, 4
+ psrldq m1, 4
+ psrldq m2, 4
+ psrldq m3, 4
+ movd [P5], m0
+ movd [P1], m2
+ movd [Q2], m1
+ movd [Q6], m3
+ psrldq m0, 4
+ psrldq m1, 4
+ psrldq m2, 4
+ psrldq m3, 4
+ movd [P4], m0
+ movd [P0], m2
+ movd [Q3], m1
+ movd [Q7], m3
+%else
+ movd [P7], m0
+ movd [P5], m2
+ movd [P3], m1
+ movd [P1], m3
+ psrlq m0, 32
+ psrlq m2, 32
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [P6], m0
+ movd [P4], m2
+ movd [P2], m1
+ movd [P0], m3
+%endif
+%else
+ ; the following code do a transpose of 8 full lines to 16 half
+ ; lines (high part). It is inlined to avoid the need of a staging area
+ mova m0, [P3]
+ mova m1, [P2]
+ mova m2, [P1]
+ mova m3, [P0]
+ mova m4, [Q0]
+ mova m5, [Q1]
+%ifdef m8
+ mova m6, [Q2]
+%endif
+ mova m7, [Q3]
+ DEFINE_REAL_P7_TO_Q7
+%ifdef m8
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY bw, 4, 5, 8
+ SBUTTERFLY bw, 6, 7, 8
+ SBUTTERFLY wd, 0, 2, 8
+ SBUTTERFLY wd, 1, 3, 8
+ SBUTTERFLY wd, 4, 6, 8
+ SBUTTERFLY wd, 5, 7, 8
+ SBUTTERFLY dq, 0, 4, 8
+ SBUTTERFLY dq, 1, 5, 8
+ SBUTTERFLY dq, 2, 6, 8
+ SBUTTERFLY dq, 3, 7, 8
+%else
+ SBUTTERFLY bw, 0, 1, 6
+ mova [rsp+mmsize*4], m1
+ mova m6, [rsp+mmsize*6]
+ SBUTTERFLY bw, 2, 3, 1
+ SBUTTERFLY bw, 4, 5, 1
+ SBUTTERFLY bw, 6, 7, 1
+ SBUTTERFLY wd, 0, 2, 1
+ mova [rsp+mmsize*6], m2
+ mova m1, [rsp+mmsize*4]
+ SBUTTERFLY wd, 1, 3, 2
+ SBUTTERFLY wd, 4, 6, 2
+ SBUTTERFLY wd, 5, 7, 2
+ SBUTTERFLY dq, 0, 4, 2
+ SBUTTERFLY dq, 1, 5, 2
+%if mmsize == 16
+ movh [Q0], m1
+ movhps [Q1], m1
+%else
+ mova [P3], m1
+%endif
+ mova m2, [rsp+mmsize*6]
+ SBUTTERFLY dq, 2, 6, 1
+ SBUTTERFLY dq, 3, 7, 1
+%endif
+ SWAP 3, 6
+ SWAP 1, 4
+%if mmsize == 16
+ movh [P7], m0
+ movhps [P6], m0
+ movh [P5], m1
+ movhps [P4], m1
+ movh [P3], m2
+ movhps [P2], m2
+ movh [P1], m3
+ movhps [P0], m3
+%ifdef m8
+ movh [Q0], m4
+ movhps [Q1], m4
+%endif
+ movh [Q2], m5
+ movhps [Q3], m5
+ movh [Q4], m6
+ movhps [Q5], m6
+ movh [Q6], m7
+ movhps [Q7], m7
+%else
+ mova [P7], m0
+ mova [P6], m1
+ mova [P5], m2
+ mova [P4], m3
+ mova [P2], m5
+ mova [P1], m6
+ mova [P0], m7
+%endif
+%endif
+%endif
+
+ RET
+%endmacro
+
+%macro LPF_16_VH 5
+INIT_XMM %5
+LOOPFILTER v, %1, %2, 0, %4
+LOOPFILTER h, %1, %2, %3, %4
+%endmacro
+
+%macro LPF_16_VH_ALL_OPTS 4
+LPF_16_VH %1, %2, %3, %4, sse2
+LPF_16_VH %1, %2, %3, %4, ssse3
+LPF_16_VH %1, %2, %3, %4, avx
+%endmacro
+
+LPF_16_VH_ALL_OPTS 16, 512, 256, 32
+LPF_16_VH_ALL_OPTS 44, 0, 128, 0
+LPF_16_VH_ALL_OPTS 48, 256, 128, 16
+LPF_16_VH_ALL_OPTS 84, 256, 128, 16
+LPF_16_VH_ALL_OPTS 88, 256, 128, 16
+
+INIT_MMX mmxext
+LOOPFILTER v, 4, 0, 0, 0
+LOOPFILTER h, 4, 0, 64, 0
+LOOPFILTER v, 8, 128, 0, 8
+LOOPFILTER h, 8, 128, 64, 8
diff --git a/libs/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm b/libs/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 000000000..c0888170c
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511
+pw_2047: times 16 dw 2047
+pw_16384: times 16 dw 16384
+pw_m512: times 16 dw -512
+pw_m2048: times 16 dw -2048
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+ mova [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+ SWAP %1, %2
+%else
+ mova m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+ mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calculate p or q portion of flat8out
+%macro FLAT8OUT_HALF 0
+ psubw m4, m0 ; q4-q0
+ psubw m5, m0 ; q5-q0
+ psubw m6, m0 ; q6-q0
+ psubw m7, m0 ; q7-q0
+ ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0)
+ ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0)
+ pcmpgtw m4, reg_F ; abs(q4-q0) > F
+ pcmpgtw m5, reg_F ; abs(q5-q0) > F
+ pcmpgtw m6, reg_F ; abs(q6-q0) > F
+ pcmpgtw m7, reg_F ; abs(q7-q0) > F
+ por m5, m4
+ por m7, m6
+ por m7, m5 ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
+%macro FLAT8IN_HALF 1
+%if %1 > 4
+ psubw m4, m3, m0 ; q3-q0
+ psubw m5, m2, m0 ; q2-q0
+ ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0)
+ pcmpgtw m4, reg_F ; abs(q3-q0) > F
+ pcmpgtw m5, reg_F ; abs(q2-q0) > F
+%endif
+ psubw m3, m2 ; q3-q2
+ psubw m2, m1 ; q2-q1
+ ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1)
+ pcmpgtw m3, reg_I ; abs(q3-q2) > I
+ pcmpgtw m2, reg_I ; abs(q2-q1) > I
+%if %1 > 4
+ por m4, m5
+%endif
+ por m2, m3
+ psubw m3, m1, m0 ; q1-q0
+ ABS1 m3, m5 ; abs(q1-q0)
+%if %1 > 4
+ pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F
+%endif
+ pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H
+ pcmpgtw m3, reg_I ; abs(q1-q0) > I
+%if %1 > 4
+ por m4, m6
+%endif
+ por m2, m3
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; values in register so we can write it out later
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+ ; src/sub1, sub2, add1, add2, dont_store
+ psrlw %1, %2, %4
+ psubw %1, %6 ; abs->delta
+%ifnidn %7, ""
+ psubw %2, %6
+ psubw %2, %7
+ paddw %2, %8
+ paddw %2, %9
+%endif
+ pand %1, reg_%3 ; apply mask
+%if %10 == 1
+ paddw %6, %1 ; delta->abs
+%else
+ paddw %1, %6 ; delta->abs
+ mova [%5], %1
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %1 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+ ; prepare E, I and H masks
+ shl Ed, %3-8
+ shl Id, %3-8
+ shl Hd, %3-8
+%if cpuflag(ssse3)
+ mova m0, [pw_256]
+%endif
+ movd m1, Ed
+ movd m2, Id
+ movd m3, Hd
+%if cpuflag(ssse3)
+ pshufb m1, m0 ; E << (bit_depth - 8)
+ pshufb m2, m0 ; I << (bit_depth - 8)
+ pshufb m3, m0 ; H << (bit_depth - 8)
+%else
+ punpcklwd m1, m1
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+ pshufd m1, m1, q0000
+ pshufd m2, m2, q0000
+ pshufd m3, m3, q0000
+%endif
+ SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E
+ SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I
+ SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H
+%if %2 > 4
+ PRELOAD 11, pw_ %+ %%maxf, F
+%endif
+
+ ; set up variables to load data
+%ifidn %1, v
+ DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+ lea stride3q, [strideq*3]
+ neg strideq
+%if %2 == 16
+ lea dst0q, [dst8q+strideq*8]
+%else
+ lea dst4q, [dst8q+strideq*4]
+%endif
+ neg strideq
+%if %2 == 16
+ lea dst12q, [dst8q+strideq*4]
+ lea dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+ DEFINE_ARGS dst0, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+ movu m0, [dst0q+strideq*0-8]
+ movu m1, [dst0q+strideq*1-8]
+ movu m2, [dst0q+strideq*2-8]
+ movu m3, [dst0q+stride3q -8]
+ movu m4, [dst4q+strideq*0-8]
+ movu m5, [dst4q+strideq*1-8]
+ movu m6, [dst4q+strideq*2-8]
+ movu m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+ mova [%%p3], m0
+ mova [%%p2], m1
+ mova [%%p1], m2
+ mova [%%p0], m3
+%if ARCH_X86_64
+ mova [%%q0], m4
+%endif
+ mova [%%q1], m5
+ mova [%%q2], m6
+ mova [%%q3], m7
+
+ ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+ ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+ mova m0, [dst0q+strideq*0-16]
+ mova m1, [dst0q+strideq*1-16]
+ mova m2, [dst0q+strideq*2-16]
+ mova m3, [dst0q+stride3q -16]
+ mova m4, [dst4q+strideq*0-16]
+ mova m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2-16]
+%endif
+ mova m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+ mova [%%p7], m0
+ mova [%%p6], m1
+ mova [%%p5], m2
+ mova [%%p4], m3
+%if ARCH_X86_64
+ mova [%%p3], m4
+%endif
+ mova [%%p2], m5
+ mova [%%p1], m6
+ mova [%%p0], m7
+
+ mova m0, [dst0q+strideq*0]
+ mova m1, [dst0q+strideq*1]
+ mova m2, [dst0q+strideq*2]
+ mova m3, [dst0q+stride3q ]
+ mova m4, [dst4q+strideq*0]
+ mova m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+ mova m6, [dst4q+strideq*2]
+%endif
+ mova m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+ mova [%%q0], m0
+ mova [%%q1], m1
+ mova [%%q2], m2
+ mova [%%q3], m3
+%if ARCH_X86_64
+ mova [%%q4], m4
+%endif
+ mova [%%q5], m5
+ mova [%%q6], m6
+ mova [%%q7], m7
+
+ ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+ ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+ ; load q0|q4-7 data
+ mova m0, [%%q0]
+%if %2 == 16
+ mova m4, [%%q4]
+ mova m5, [%%q5]
+ mova m6, [%%q6]
+ mova m7, [%%q7]
+
+ ; flat8out q portion
+ FLAT8OUT_HALF
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; load q1-3 data
+ mova m1, [%%q1]
+ mova m2, [%%q2]
+ mova m3, [%%q3]
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flatout[q]
+ ; m12-14=free
+ ; m0-3=q0-q3
+ ; m4-7=free
+
+ ; flat8in|fm|hev q portion
+ FLAT8IN_HALF %2
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; m2=!fm[q]
+ ; m0,1=q0-q1
+ ; m2-7=free
+ ; m12=free
+
+ ; load p0-1
+ mova m3, [%%p0]
+ mova m4, [%%p1]
+
+ ; fm mb_edge portion
+ psubw m5, m3, m0 ; q0-p0
+ psubw m6, m4, m1 ; q1-p1
+%if ARCH_X86_64
+ ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1)
+%else
+ ABS1 m5, m7 ; abs(q0-p0)
+ ABS1 m6, m7 ; abs(q1-p1)
+%endif
+ paddw m5, m5
+ psraw m6, 1
+ paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+ pcmpgtw m6, reg_E
+ por m2, m6
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out[q]
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m3-4=q0-1
+ ; m0-2/5-7=free
+
+ ; load p4-7 data
+ SWAP 3, 0 ; p0
+ SWAP 4, 1 ; p1
+%if %2 == 16
+ mova m7, [%%p7]
+ mova m6, [%%p6]
+ mova m5, [%%p5]
+ mova m4, [%%p4]
+
+ ; flat8out p portion
+ FLAT8OUT_HALF
+ por m7, reg_F8O
+ SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+ ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+ ; r9[m15]=!flat8out
+ ; r10[m13]=hev[q]
+ ; r11[m14]=!flat8in[q]
+ ; r12[m12]=!fm[q]
+ ; m0=p0
+ ; m1-7=free
+
+ ; load p2-3 data
+ mova m2, [%%p2]
+ mova m3, [%%p3]
+
+ ; flat8in|fm|hev p portion
+ FLAT8IN_HALF %2
+ por m7, reg_HEV
+%if %2 > 4
+ por m4, reg_F8I
+%endif
+ por m2, reg_FM
+%if %2 > 4
+ por m4, m2 ; !flat8|!fm
+%if %2 == 16
+ por m5, m4, reg_F8O ; !flat16|!fm
+ pandn m2, m4 ; filter4_mask
+ pandn m4, m5 ; filter8_mask
+ pxor m5, [pw_m1] ; filter16_mask
+ SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+ pandn m2, m4 ; filter4_mask
+ pxor m4, [pw_m1] ; filter8_mask
+%endif
+ SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+ pxor m2, [pw_m1] ; filter4_mask
+%endif
+ SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
+ SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M
+
+ ; r9[m15]=filter16_mask
+ ; r10[m13]=hev
+ ; r11[m14]=filter8_mask
+ ; r12[m12]=filter4_mask
+ ; m0,1=p0-p1
+ ; m2-7=free
+ ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+ ; filter_14
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m6, [%%p5]
+ mova m7, [%%p4]
+ PRELOAD 8, %%p3, P3
+ PRELOAD 9, %%p2, P2
+%endif
+ PRELOAD 10, %%q0, Q0
+ PRELOAD 11, %%q1, Q1
+%if %2 == 16
+ psllw m4, m2, 3
+ paddw m5, m3, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddw m4, reg_P3
+ paddw m5, reg_P2
+ paddw m4, m1
+ paddw m5, m0
+ paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8
+ psubw m5, m2 ; p0+p2+p4+p6*2-p7
+ paddw m4, [pw_8]
+ paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+ ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+ ; at the end of the filter
+
+ mova [rsp+0*mmsize], m3
+ FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
+%endif
+ mova m3, [%%q2]
+%if %2 == 16
+ mova [rsp+1*mmsize], m6
+ FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
+%endif
+ mova m6, [%%q3]
+%if %2 == 16
+ mova [rsp+2*mmsize], m7
+ FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
+ mova m7, [%%q4]
+%if ARCH_X86_64
+ mova [rsp+3*mmsize], reg_P3
+%else
+ mova m4, reg_P3
+ mova [rsp+3*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
+ PRELOAD 8, %%q5, Q5
+%if ARCH_X86_64
+ mova [rsp+4*mmsize], reg_P2
+%else
+ mova m4, reg_P2
+ mova [rsp+4*mmsize], m4
+%endif
+ FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
+ PRELOAD 9, %%q6, Q6
+ mova [rsp+5*mmsize], m1
+ FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
+ mova m1, [%%q7]
+ FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
+ FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
+ FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+ FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6
+
+ mova m7, [%%p1]
+%else
+ SWAP 1, 7
+%endif
+
+ mova m2, [%%p3]
+ mova m1, [%%p2]
+
+ ; reg_Q0-1 (m10-m11)
+ ; m0=p0
+ ; m1=p2
+ ; m2=p3
+ ; m3=q2
+ ; m4-5=free
+ ; m6=q3
+ ; m7=p1
+ ; m8-9 unused
+
+ ; filter_6
+ psllw m4, m2, 2
+ paddw m5, m1, m1
+ paddw m4, m7
+ psubw m5, m2
+ paddw m4, m0
+ paddw m5, reg_Q0
+ paddw m4, [pw_4]
+ paddw m5, m4
+
+%if ARCH_X86_64
+ mova m8, m1
+ mova m9, m7
+%else
+ mova [rsp+0*mmsize], m1
+ mova [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
+ FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
+%if ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
+%else
+ FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+ FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
+%endif
+ FILTER_STEP m4, m5, F8M, 3, %%q2, m3
+
+ UNSCRATCH 2, 10, %%q0
+ UNSCRATCH 6, 11, %%q1
+%else
+ SWAP 1, 7
+ mova m2, [%%q0]
+ mova m6, [%%q1]
+%endif
+ UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV
+
+ ; m0=p0
+ ; m1=p2
+ ; m2=q0
+ ; m3=hev_mask
+ ; m4-5=free
+ ; m6=q1
+ ; m7=p1
+
+ ; filter_4
+ psubw m4, m7, m6 ; p1-q1
+ psubw m5, m2, m0 ; q0-p0
+ pand m4, m3
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f
+ paddw m4, m5
+ paddw m5, m5
+ paddw m4, m5 ; 3*(q0-p0)+f
+ pminsw m4, [pw_ %+ %%maxsgn]
+ pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f
+ pand m4, reg_F4M
+ paddw m5, m4, [pw_4]
+ paddw m4, [pw_3]
+ pminsw m5, [pw_ %+ %%maxsgn]
+ pminsw m4, [pw_ %+ %%maxsgn]
+ psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1
+ psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2
+ psubw m2, m5 ; q0-f1
+ paddw m0, m4 ; p0+f2
+ pandn m3, m5 ; f1 & !hev (for p1/q1 adj)
+ pxor m4, m4
+ mova m5, [pw_ %+ %%maxusgn]
+ pmaxsw m2, m4
+ pmaxsw m0, m4
+ pminsw m2, m5
+ pminsw m0, m5
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_16384] ; (f1+1)>>1
+%else
+ paddw m3, [pw_1]
+ psraw m3, 1
+%endif
+ paddw m7, m3 ; p1+f
+ psubw m6, m3 ; q1-f
+ pmaxsw m7, m4
+ pmaxsw m6, m4
+ pminsw m7, m5
+ pminsw m6, m5
+
+ ; store
+%ifidn %1, v
+ mova [%%p1], m7
+ mova [%%p0], m0
+ mova [%%q0], m2
+ mova [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+ TRANSPOSE4x4W 7, 0, 2, 6, 1
+ movh [dst0q+strideq*0-4], m7
+ movhps [dst0q+strideq*1-4], m7
+ movh [dst0q+strideq*2-4], m0
+ movhps [dst0q+stride3q -4], m0
+ movh [dst4q+strideq*0-4], m2
+ movhps [dst4q+strideq*1-4], m2
+ movh [dst4q+strideq*2-4], m6
+ movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+ mova m3, [%%p3]
+ mova m4, [%%q2]
+ mova m5, [%%q3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+ TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+ mova m2, [%%q0]
+%endif
+
+ movu [dst0q+strideq*0-8], m3
+ movu [dst0q+strideq*1-8], m1
+ movu [dst0q+strideq*2-8], m7
+ movu [dst0q+stride3q -8], m0
+ movu [dst4q+strideq*0-8], m2
+ movu [dst4q+strideq*1-8], m6
+ movu [dst4q+strideq*2-8], m4
+ movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+ SCRATCH 2, 8, %%q0
+ SCRATCH 6, 9, %%q1
+ mova m2, [%%p7]
+ mova m3, [%%p6]
+ mova m4, [%%p5]
+ mova m5, [%%p4]
+ mova m6, [%%p3]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+ mova [%%p1], m7
+ TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+ mova [dst0q+strideq*0-16], m2
+ mova [dst0q+strideq*1-16], m3
+ mova [dst0q+strideq*2-16], m4
+ mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+ mova [dst4q+strideq*0-16], m6
+%endif
+ mova [dst4q+strideq*1-16], m1
+ mova [dst4q+strideq*2-16], m7
+ mova [dst4q+stride3q -16], m0
+
+ UNSCRATCH 2, 8, %%q0
+ UNSCRATCH 6, 9, %%q1
+ mova m0, [%%q2]
+ mova m1, [%%q3]
+ mova m3, [%%q4]
+ mova m4, [%%q5]
+%if ARCH_X86_64
+ mova m5, [%%q6]
+%endif
+ mova m7, [%%q7]
+
+%if ARCH_X86_64
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+ TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+ mova [dst0q+strideq*0], m2
+ mova [dst0q+strideq*1], m6
+ mova [dst0q+strideq*2], m0
+ mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+ mova [dst4q+strideq*0], m3
+%endif
+ mova [dst4q+strideq*1], m4
+ mova [dst4q+strideq*2], m5
+ mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+ RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2
+LOOP_FILTER_CPUSETS %1, 4, %2
+LOOP_FILTER_CPUSETS %1, 8, %2
+LOOP_FILTER_CPUSETS %1, 16, %2
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10
+LOOP_FILTER_WDSETS v, 10
+LOOP_FILTER_WDSETS h, 12
+LOOP_FILTER_WDSETS v, 12
diff --git a/libs/ffvpx/libavcodec/x86/vp9mc.asm b/libs/ffvpx/libavcodec/x86/vp9mc.asm
new file mode 100644
index 000000000..f64161b2c
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9mc.asm
@@ -0,0 +1,675 @@
+;******************************************************************************
+;* VP9 motion compensation SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_256
+cextern pw_64
+
+%macro F8_SSSE3_TAPS 8
+times 16 db %1, %2
+times 16 db %3, %4
+times 16 db %5, %6
+times 16 db %7, %8
+%endmacro
+
+%macro F8_SSE2_TAPS 8
+times 8 dw %1
+times 8 dw %2
+times 8 dw %3
+times 8 dw %4
+times 8 dw %5
+times 8 dw %6
+times 8 dw %7
+times 8 dw %8
+%endmacro
+
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
+
+%macro FILTER 1
+const filters_%1 ; smooth
+ F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
+ F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
+ F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
+ F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0
+ F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0
+ F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0
+ F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1
+ F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1
+ F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1
+ F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1
+ F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2
+ F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2
+ F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2
+ F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2
+ F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3
+ ; regular
+ F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0
+ F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0
+ F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1
+ F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1
+ F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1
+ F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1
+ F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1
+ F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1
+ F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1
+ F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1
+ F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1
+ F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1
+ F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1
+ F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1
+ F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0
+ ; sharp
+ F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0
+ F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1
+ F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2
+ F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2
+ F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3
+ F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3
+ F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4
+ F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4
+ F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4
+ F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4
+ F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4
+ F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4
+ F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
+ F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
+ F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1
+%endmacro
+
+%define F8_TAPS F8_SSSE3_TAPS
+; int8_t ff_filters_ssse3[3][15][4][32]
+FILTER ssse3
+%define F8_TAPS F8_SSE2_TAPS
+; int16_t ff_filters_sse2[3][15][8][8]
+FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
+
+SECTION .text
+
+%macro filter_sse2_h_fn 1
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+ pxor m5, m5
+ mova m6, [pw_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 16]
+ mova m9, [filteryq+ 32]
+ mova m10, [filteryq+ 48]
+ mova m11, [filteryq+ 64]
+ mova m12, [filteryq+ 80]
+ mova m13, [filteryq+ 96]
+ mova m14, [filteryq+112]
+%endif
+.loop:
+ movh m0, [srcq-3]
+ movh m1, [srcq-2]
+ movh m2, [srcq-1]
+ movh m3, [srcq+0]
+ movh m4, [srcq+1]
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmullw m1, m8
+ pmullw m2, m9
+ pmullw m3, m10
+ pmullw m4, m11
+%else
+ pmullw m1, [filteryq+ 16]
+ pmullw m2, [filteryq+ 32]
+ pmullw m3, [filteryq+ 48]
+ pmullw m4, [filteryq+ 64]
+%endif
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m4
+ movh m1, [srcq+2]
+ movh m3, [srcq+3]
+ movh m4, [srcq+4]
+ add srcq, sstrideq
+ punpcklbw m1, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+%if ARCH_X86_64 && mmsize > 8
+ pmullw m1, m12
+ pmullw m3, m13
+ pmullw m4, m14
+%else
+ pmullw m1, [filteryq+ 80]
+ pmullw m3, [filteryq+ 96]
+ pmullw m4, [filteryq+112]
+%endif
+ paddw m0, m1
+ paddw m3, m4
+ paddw m0, m6
+ paddw m2, m3
+ paddsw m0, m2
+ psraw m0, 7
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ packuswb m0, m0
+%ifidn %1, avg
+ pavgb m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_MMX mmxext
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+INIT_XMM sse2
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+%macro filter_h_fn 1
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+ mova m6, [pw_256]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movh m0, [srcq-3]
+ movh m1, [srcq-2]
+ movh m2, [srcq-1]
+ movh m3, [srcq+0]
+ movh m4, [srcq+1]
+ movh m5, [srcq+2]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ movh m1, [srcq+3]
+ movh m3, [srcq+4]
+ add srcq, sstrideq
+ punpcklbw m4, m5
+ punpcklbw m1, m3
+ pmaddubsw m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddubsw m2, m8
+ pmaddubsw m4, m9
+ pmaddubsw m1, m10
+%else
+ pmaddubsw m2, [filteryq+32]
+ pmaddubsw m4, [filteryq+64]
+ pmaddubsw m1, [filteryq+96]
+%endif
+ paddw m0, m4
+ paddw m2, m1
+ paddsw m0, m2
+ pmulhrsw m0, m6
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ packuswb m0, m0
+%ifidn %1, avg
+ pavgb m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_MMX ssse3
+filter_h_fn put
+filter_h_fn avg
+
+INIT_XMM ssse3
+filter_h_fn put
+filter_h_fn avg
+
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+ mova m13, [pw_256]
+ mova m8, [filteryq+ 0]
+ mova m9, [filteryq+32]
+ mova m10, [filteryq+64]
+ mova m11, [filteryq+96]
+.loop:
+ movu m0, [srcq-3]
+ movu m1, [srcq-2]
+ movu m2, [srcq-1]
+ movu m3, [srcq+0]
+ movu m4, [srcq+1]
+ movu m5, [srcq+2]
+ movu m6, [srcq+3]
+ movu m7, [srcq+4]
+ add srcq, sstrideq
+ SBUTTERFLY bw, 0, 1, 12
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ paddsw m0, m2
+ paddsw m1, m3
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+%ifidn %1, avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_hx2_fn put
+filter_hx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro filter_sse2_v_fn 1
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ pxor m5, m5
+ mova m6, [pw_64]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 16]
+ mova m9, [filteryq+ 32]
+ mova m10, [filteryq+ 48]
+ mova m11, [filteryq+ 64]
+ mova m12, [filteryq+ 80]
+ mova m13, [filteryq+ 96]
+ mova m14, [filteryq+112]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movh m0, [srcq]
+ movh m1, [srcq+sstrideq]
+ movh m2, [srcq+sstrideq*2]
+ movh m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movh m4, [src4q]
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmullw m1, m8
+ pmullw m2, m9
+ pmullw m3, m10
+ pmullw m4, m11
+%else
+ pmullw m1, [filteryq+ 16]
+ pmullw m2, [filteryq+ 32]
+ pmullw m3, [filteryq+ 48]
+ pmullw m4, [filteryq+ 64]
+%endif
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m4
+ movh m1, [src4q+sstrideq]
+ movh m3, [src4q+sstrideq*2]
+ movh m4, [src4q+sstride3q]
+ add src4q, sstrideq
+ punpcklbw m1, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+%if ARCH_X86_64 && mmsize > 8
+ pmullw m1, m12
+ pmullw m3, m13
+ pmullw m4, m14
+%else
+ pmullw m1, [filteryq+ 80]
+ pmullw m3, [filteryq+ 96]
+ pmullw m4, [filteryq+112]
+%endif
+ paddw m0, m1
+ paddw m3, m4
+ paddw m0, m6
+ paddw m2, m3
+ paddsw m0, m2
+ psraw m0, 7
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ packuswb m0, m0
+%ifidn %1, avg
+ pavgb m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_MMX mmxext
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+INIT_XMM sse2
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+%macro filter_v_fn 1
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m6, [pw_256]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just more generally
+ ; unroll this to prevent multiple loads of the same data?
+ movh m0, [srcq]
+ movh m1, [srcq+sstrideq]
+ movh m2, [srcq+sstrideq*2]
+ movh m3, [srcq+sstride3q]
+ movh m4, [src4q]
+ movh m5, [src4q+sstrideq]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ movh m1, [src4q+sstrideq*2]
+ movh m3, [src4q+sstride3q]
+ add srcq, sstrideq
+ add src4q, sstrideq
+ punpcklbw m4, m5
+ punpcklbw m1, m3
+ pmaddubsw m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddubsw m2, m8
+ pmaddubsw m4, m9
+ pmaddubsw m1, m10
+%else
+ pmaddubsw m2, [filteryq+32]
+ pmaddubsw m4, [filteryq+64]
+ pmaddubsw m1, [filteryq+96]
+%endif
+ paddw m0, m4
+ paddw m2, m1
+ paddsw m0, m2
+ pmulhrsw m0, m6
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ packuswb m0, m0
+%ifidn %1, avg
+ pavgb m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_MMX ssse3
+filter_v_fn put
+filter_v_fn avg
+
+INIT_XMM ssse3
+filter_v_fn put
+filter_v_fn avg
+
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+ mova m13, [pw_256]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m8, [filteryq+ 0]
+ mova m9, [filteryq+32]
+ mova m10, [filteryq+64]
+ mova m11, [filteryq+96]
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq]
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ movu m4, [src4q]
+ movu m5, [src4q+sstrideq]
+ movu m6, [src4q+sstrideq*2]
+ movu m7, [src4q+sstride3q]
+ add srcq, sstrideq
+ add src4q, sstrideq
+ SBUTTERFLY bw, 0, 1, 12
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ paddsw m0, m2
+ paddsw m1, m3
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+%ifidn %1, avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_vx2_fn put
+filter_vx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro fpel_fn 6-8 0, 4
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
+%if %2 <= mmsize
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+ lea sstride3q, [sstrideq*3]
+ lea dstride3q, [dstrideq*3]
+%else
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
+%endif
+.loop:
+ %%srcfn m0, [srcq]
+ %%srcfn m1, [srcq+s%3]
+ %%srcfn m2, [srcq+s%4]
+ %%srcfn m3, [srcq+s%5]
+%if %2/mmsize == 8
+ %%srcfn m4, [srcq+mmsize*4]
+ %%srcfn m5, [srcq+mmsize*5]
+ %%srcfn m6, [srcq+mmsize*6]
+ %%srcfn m7, [srcq+mmsize*7]
+%endif
+ lea srcq, [srcq+sstrideq*%6]
+%ifidn %1, avg
+ %%pavg m0, [dstq]
+ %%pavg m1, [dstq+d%3]
+ %%pavg m2, [dstq+d%4]
+ %%pavg m3, [dstq+d%5]
+%if %2/mmsize == 8
+ %%pavg m4, [dstq+mmsize*4]
+ %%pavg m5, [dstq+mmsize*5]
+ %%pavg m6, [dstq+mmsize*6]
+ %%pavg m7, [dstq+mmsize*7]
+%endif
+%endif
+ %%dstfn [dstq], m0
+ %%dstfn [dstq+d%3], m1
+ %%dstfn [dstq+d%4], m2
+ %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+ %%dstfn [dstq+mmsize*4], m4
+ %%dstfn [dstq+mmsize*5], m5
+ %%dstfn [dstq+mmsize*6], m6
+ %%dstfn [dstq+mmsize*7], m7
+%endif
+ lea dstq, [dstq+dstrideq*%6]
+ sub hd, %6
+ jnz .loop
+ RET
+%endmacro
+
+%define d16 16
+%define s16 16
+%define d32 32
+%define s32 32
+INIT_MMX mmx
+fpel_fn put, 4, strideq, strideq*2, stride3q, 4
+fpel_fn put, 8, strideq, strideq*2, stride3q, 4
+INIT_MMX mmxext
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
+INIT_YMM avx
+fpel_fn put, 32, strideq, strideq*2, stride3q, 4
+fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
+%endif
+%undef s16
+%undef d16
+%undef s32
+%undef d32
diff --git a/libs/ffvpx/libavcodec/x86/vp9mc_16bpp.asm b/libs/ffvpx/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 000000000..9a462eaf8
--- /dev/null
+++ b/libs/ffvpx/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64
+
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movh m0, [srcq-6]
+ movh m1, [srcq-4]
+ movh m2, [srcq-2]
+ movh m3, [srcq+0]
+ movh m4, [srcq+2]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+32]
+%endif
+ movu m1, [srcq+4]
+ movu m3, [srcq+6]
+ paddd m0, m2
+ movu m2, [srcq+8]
+ add srcq, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+32]
+ mova m9, [filteryq+64]
+ mova m10, [filteryq+96]
+%endif
+.loop:
+ movu m0, [srcq-6]
+ movu m1, [srcq-4]
+ movu m2, [srcq-2]
+ movu m3, [srcq+0]
+ movu m4, [srcq+2]
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+%else
+ pmaddwd m2, [filteryq+32]
+ pmaddwd m3, [filteryq+32]
+ pmaddwd m4, [filteryq+64]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ movu m2, [srcq+4]
+ movu m3, [srcq+6]
+ movu m4, [srcq+8]
+ add srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m9
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m2, [filteryq+64]
+ pmaddwd m3, [filteryq+96]
+ pmaddwd m4, [filteryq+96]
+%endif
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ paddd m0, m6
+ paddd m1, m6
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+ packusdw m1, m1
+%else
+ packssdw m0, m0
+ packssdw m1, m1
+%endif
+ punpcklwd m0, m1
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m11, m11
+%endif
+ mova m6, [pd_64]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movh m0, [srcq]
+ movh m1, [srcq+sstrideq]
+ movh m2, [srcq+sstrideq*2]
+ movh m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movh m4, [src4q]
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ pmaddwd m0, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+%endif
+ movh m1, [src4q+sstrideq]
+ movh m3, [src4q+sstrideq*2]
+ paddd m0, m2
+ movh m2, [src4q+sstride3q]
+ add src4q, sstrideq
+ punpcklwd m4, m1
+ punpcklwd m3, m2
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m3, [filteryq+ 96]
+%endif
+ paddd m0, m4
+ paddd m0, m3
+ paddd m0, m6
+ psrad m0, 7
+%if cpuflag(sse4)
+ packusdw m0, m0
+%else
+ packssdw m0, m0
+%endif
+%ifidn %1, avg
+ movh m1, [dstq]
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m11
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, m1
+%endif
+ movh [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%define hd r4mp
+%endif
+ mova m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+ pxor m12, m12
+%endif
+%if ARCH_X86_64
+ mova m11, [pd_64]
+%endif
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+ mova m8, [filteryq+ 32]
+ mova m9, [filteryq+ 64]
+ mova m10, [filteryq+ 96]
+%endif
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq]
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ add srcq, sstrideq
+ movu m4, [src4q]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 2, 3, 6
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m2, m8
+ pmaddwd m3, m8
+%else
+ pmaddwd m2, [filteryq+ 32]
+ pmaddwd m3, [filteryq+ 32]
+%endif
+ paddd m0, m2
+ paddd m1, m3
+ movu m2, [src4q+sstrideq]
+ movu m3, [src4q+sstrideq*2]
+ SBUTTERFLY wd, 4, 2, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m4, m9
+ pmaddwd m2, m9
+%else
+ pmaddwd m4, [filteryq+ 64]
+ pmaddwd m2, [filteryq+ 64]
+%endif
+ paddd m0, m4
+ paddd m1, m2
+ movu m4, [src4q+sstride3q]
+ add src4q, sstrideq
+ SBUTTERFLY wd, 3, 4, 6
+%if ARCH_X86_64 && mmsize > 8
+ pmaddwd m3, m10
+ pmaddwd m4, m10
+%else
+ pmaddwd m3, [filteryq+ 96]
+ pmaddwd m4, [filteryq+ 96]
+%endif
+ paddd m0, m3
+ paddd m1, m4
+%if ARCH_X86_64
+ paddd m0, m11
+ paddd m1, m11
+%else
+ paddd m0, [pd_64]
+ paddd m1, [pd_64]
+%endif
+ psrad m0, 7
+ psrad m1, 7
+%if cpuflag(sse4)
+ packusdw m0, m1
+%else
+ packssdw m0, m1
+%endif
+ pminsw m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+ pmaxsw m0, m12
+%else
+ pxor m2, m2
+ pmaxsw m0, m2
+%endif
+%endif
+%ifidn %1, avg
+ pavgw m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+ mov filteryq, r5mp
+%endif
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif